Home | History | Annotate | Download | only in loader
      1 /*
      2     Copyright (C) 1999 Lars Knoll (knoll (at) mpi-hd.mpg.de)
      3     Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
      4     Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap (at) nypop.com)
      5 
      6     This library is free software; you can redistribute it and/or
      7     modify it under the terms of the GNU Library General Public
      8     License as published by the Free Software Foundation; either
      9     version 2 of the License, or (at your option) any later version.
     10 
     11     This library is distributed in the hope that it will be useful,
     12     but WITHOUT ANY WARRANTY; without even the implied warranty of
     13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14     Library General Public License for more details.
     15 
     16     You should have received a copy of the GNU Library General Public License
     17     along with this library; see the file COPYING.LIB.  If not, write to
     18     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     19     Boston, MA 02110-1301, USA.
     20 */
     21 
     22 
     23 #include "config.h"
     24 #include "TextResourceDecoder.h"
     25 
     26 #include "DOMImplementation.h"
     27 #include "HTMLNames.h"
     28 #include "TextCodec.h"
     29 #include "TextEncoding.h"
     30 #include "TextEncodingDetector.h"
     31 #include "TextEncodingRegistry.h"
     32 #include <wtf/ASCIICType.h>
     33 #include <wtf/StringExtras.h>
     34 
     35 using namespace WTF;
     36 
     37 namespace WebCore {
     38 
     39 using namespace HTMLNames;
     40 
     41 // You might think we should put these find functions elsewhere, perhaps with the
     42 // similar functions that operate on UChar, but arguably only the decoder has
     43 // a reason to process strings of char rather than UChar.
     44 
     45 static int find(const char* subject, size_t subjectLength, const char* target)
     46 {
     47     size_t targetLength = strlen(target);
     48     if (targetLength > subjectLength)
     49         return -1;
     50     for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
     51         bool match = true;
     52         for (size_t j = 0; j < targetLength; ++j) {
     53             if (subject[i + j] != target[j]) {
     54                 match = false;
     55                 break;
     56             }
     57         }
     58         if (match)
     59             return i;
     60     }
     61     return -1;
     62 }
     63 
     64 static int findIgnoringCase(const char* subject, size_t subjectLength, const char* target)
     65 {
     66     size_t targetLength = strlen(target);
     67     if (targetLength > subjectLength)
     68         return -1;
     69 #ifndef NDEBUG
     70     for (size_t i = 0; i < targetLength; ++i)
     71         ASSERT(isASCIILower(target[i]));
     72 #endif
     73     for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
     74         bool match = true;
     75         for (size_t j = 0; j < targetLength; ++j) {
     76             if (toASCIILower(subject[i + j]) != target[j]) {
     77                 match = false;
     78                 break;
     79             }
     80         }
     81         if (match)
     82             return i;
     83     }
     84     return -1;
     85 }
     86 
     87 static TextEncoding findTextEncoding(const char* encodingName, int length)
     88 {
     89     Vector<char, 64> buffer(length + 1);
     90     memcpy(buffer.data(), encodingName, length);
     91     buffer[length] = '\0';
     92     return buffer.data();
     93 }
     94 
     95 class KanjiCode {
     96 public:
     97     enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };
     98     static enum Type judge(const char* str, int length);
     99     static const int ESC = 0x1b;
    100     static const unsigned char sjisMap[256];
    101     static int ISkanji(int code)
    102     {
    103         if (code >= 0x100)
    104             return 0;
    105         return sjisMap[code & 0xff] & 1;
    106     }
    107     static int ISkana(int code)
    108     {
    109         if (code >= 0x100)
    110             return 0;
    111         return sjisMap[code & 0xff] & 2;
    112     }
    113 };
    114 
    115 const unsigned char KanjiCode::sjisMap[256] = {
    116     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    117     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    118     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    119     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    120     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    121     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    122     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    123     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    124     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    125     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    126     0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    127     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    128     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    129     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    130     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    131     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
    132 };
    133 
    134 /*
    135  * EUC-JP is
    136  *     [0xa1 - 0xfe][0xa1 - 0xfe]
    137  *     0x8e[0xa1 - 0xfe](SS2)
    138  *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
    139  *
    140  * Shift_Jis is
    141  *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
    142  *
    143  * Shift_Jis Hankaku Kana is
    144  *     [0xa1 - 0xdf]
    145  */
    146 
    147 /*
    148  * KanjiCode::judge() is based on judge_jcode() from jvim
    149  *     http://hp.vector.co.jp/authors/VA003457/vim/
    150  *
    151  * Special Thanks to Kenichi Tsuchida
    152  */
    153 
    154 enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
    155 {
    156     enum Type code;
    157     int i;
    158     int bfr = false;            /* Kana Moji */
    159     int bfk = 0;                /* EUC Kana */
    160     int sjis = 0;
    161     int euc = 0;
    162 
    163     const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
    164 
    165     code = ASCII;
    166 
    167     i = 0;
    168     while (i < size) {
    169         if (ptr[i] == ESC && (size - i >= 3)) {
    170             if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
    171             || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
    172                 code = JIS;
    173                 goto breakBreak;
    174             } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
    175                     || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
    176                 code = JIS;
    177                 goto breakBreak;
    178             } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
    179                 code = JIS;
    180                 i += 3;
    181             } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
    182                 code = JIS;
    183                 i += 3;
    184             } else {
    185                 i++;
    186             }
    187             bfr = false;
    188             bfk = 0;
    189         } else {
    190             if (ptr[i] < 0x20) {
    191                 bfr = false;
    192                 bfk = 0;
    193                 /* ?? check kudokuten ?? && ?? hiragana ?? */
    194                 if ((i >= 2) && (ptr[i - 2] == 0x81)
    195                         && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
    196                     code = SJIS;
    197                     sjis += 100;        /* kudokuten */
    198                 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
    199                         && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
    200                     code = EUC;
    201                     euc += 100;         /* kudokuten */
    202                 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
    203                     sjis += 40;         /* hiragana */
    204                 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
    205                     euc += 40;          /* hiragana */
    206                 }
    207             } else {
    208                 /* ?? check hiragana or katana ?? */
    209                 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
    210                     sjis++;     /* hiragana */
    211                 } else if ((size - i > 1) && (ptr[i] == 0x83)
    212                          && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
    213                     sjis++;     /* katakana */
    214                 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
    215                     euc++;      /* hiragana */
    216                 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
    217                     euc++;      /* katakana */
    218                 }
    219                 if (bfr) {
    220                     if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
    221                         code = SJIS;
    222                         goto breakBreak;
    223                     } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
    224                         code = SJIS;
    225                         goto breakBreak;
    226                     } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
    227                         code = EUC;
    228                         goto breakBreak;
    229                     } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
    230                         code = EUC;
    231                         goto breakBreak;
    232                     } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
    233                         code = SJIS;
    234                         goto breakBreak;
    235                     } else if (ptr[i] <= 0x7f) {
    236                         code = SJIS;
    237                         goto breakBreak;
    238                     } else {
    239                         if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
    240                             euc++;      /* sjis hankaku kana kigo */
    241                         } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
    242                             ;           /* sjis hankaku kana */
    243                         } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
    244                             euc++;
    245                         } else if (0x8e == ptr[i]) {
    246                             euc++;
    247                         } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
    248                             sjis++;
    249                         }
    250                         bfr = false;
    251                         bfk = 0;
    252                     }
    253                 } else if (0x8e == ptr[i]) {
    254                     if (size - i <= 1) {
    255                         ;
    256                     } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
    257                         /* EUC KANA or SJIS KANJI */
    258                         if (bfk == 1) {
    259                             euc += 100;
    260                         }
    261                         bfk++;
    262                         i++;
    263                     } else {
    264                         /* SJIS only */
    265                         code = SJIS;
    266                         goto breakBreak;
    267                     }
    268                 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
    269                     /* SJIS only */
    270                     code = SJIS;
    271                     if ((size - i >= 1)
    272                             && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
    273                             || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
    274                         goto breakBreak;
    275                     }
    276                 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
    277                     /* EUC only */
    278                     code = EUC;
    279                     if ((size - i >= 1)
    280                             && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
    281                         goto breakBreak;
    282                     }
    283                 } else if (ptr[i] <= 0x7f) {
    284                     ;
    285                 } else {
    286                     bfr = true;
    287                     bfk = 0;
    288                 }
    289             }
    290             i++;
    291         }
    292     }
    293     if (code == ASCII) {
    294         if (sjis > euc) {
    295             code = SJIS;
    296         } else if (sjis < euc) {
    297             code = EUC;
    298         }
    299     }
    300 breakBreak:
    301     return (code);
    302 }
    303 
    304 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
    305 {
    306     if (equalIgnoringCase(mimeType, "text/css"))
    307         return CSS;
    308     if (equalIgnoringCase(mimeType, "text/html"))
    309         return HTML;
    310     if (DOMImplementation::isXMLMIMEType(mimeType))
    311         return XML;
    312     return PlainText;
    313 }
    314 
    315 const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
    316 {
    317     // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
    318     // for text/xml. This matches Firefox.
    319     if (contentType == XML)
    320         return UTF8Encoding();
    321     if (!specifiedDefaultEncoding.isValid())
    322         return Latin1Encoding();
    323     return specifiedDefaultEncoding;
    324 }
    325 
    326 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
    327     : m_contentType(determineContentType(mimeType))
    328     , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
    329     , m_source(DefaultEncoding)
    330     , m_hintEncoding(0)
    331     , m_checkedForBOM(false)
    332     , m_checkedForCSSCharset(false)
    333     , m_checkedForHeadCharset(false)
    334     , m_useLenientXMLDecoding(false)
    335     , m_sawError(false)
    336     , m_usesEncodingDetector(usesEncodingDetector)
    337 {
    338 }
    339 
    340 TextResourceDecoder::~TextResourceDecoder()
    341 {
    342 }
    343 
    344 void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
    345 {
    346     // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
    347     if (!encoding.isValid())
    348         return;
    349 
    350     // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
    351     // treat x-user-defined as windows-1252 (bug 18270)
    352     if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
    353         m_encoding = "windows-1252";
    354     else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
    355         m_encoding = encoding.closestByteBasedEquivalent();
    356     else
    357         m_encoding = encoding;
    358 
    359     m_codec.clear();
    360     m_source = source;
    361 }
    362 
    363 // Returns the position of the encoding string.
    364 static int findXMLEncoding(const char* str, int len, int& encodingLength)
    365 {
    366     int pos = find(str, len, "encoding");
    367     if (pos == -1)
    368         return -1;
    369     pos += 8;
    370 
    371     // Skip spaces and stray control characters.
    372     while (pos < len && str[pos] <= ' ')
    373         ++pos;
    374 
    375     // Skip equals sign.
    376     if (pos >= len || str[pos] != '=')
    377         return -1;
    378     ++pos;
    379 
    380     // Skip spaces and stray control characters.
    381     while (pos < len && str[pos] <= ' ')
    382         ++pos;
    383 
    384     // Skip quotation mark.
    385     if (pos >= len)
    386         return - 1;
    387     char quoteMark = str[pos];
    388     if (quoteMark != '"' && quoteMark != '\'')
    389         return -1;
    390     ++pos;
    391 
    392     // Find the trailing quotation mark.
    393     int end = pos;
    394     while (end < len && str[end] != quoteMark)
    395         ++end;
    396     if (end >= len)
    397         return -1;
    398 
    399     encodingLength = end - pos;
    400     return pos;
    401 }
    402 
    403 // true if there is more to parse
    404 static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
    405 {
    406     while (pos < dataEnd && (*pos == '\t' || *pos == ' '))
    407         ++pos;
    408     return pos != dataEnd;
    409 }
    410 
    411 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
    412 {
    413     // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
    414     // We let it override even a user-chosen encoding.
    415     ASSERT(!m_checkedForBOM);
    416 
    417     size_t lengthOfBOM = 0;
    418 
    419     size_t bufferLength = m_buffer.size();
    420 
    421     size_t buf1Len = bufferLength;
    422     size_t buf2Len = len;
    423     const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
    424     const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
    425     unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    426     unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    427     unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    428     unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
    429 
    430     // Check for the BOM.
    431     if (c1 == 0xFF && c2 == 0xFE) {
    432         if (c3 != 0 || c4 != 0) {
    433             setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
    434             lengthOfBOM = 2;
    435         } else {
    436             setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
    437             lengthOfBOM = 4;
    438         }
    439     } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
    440         setEncoding(UTF8Encoding(), AutoDetectedEncoding);
    441         lengthOfBOM = 3;
    442     } else if (c1 == 0xFE && c2 == 0xFF) {
    443         setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
    444         lengthOfBOM = 2;
    445     } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
    446         setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
    447         lengthOfBOM = 4;
    448     }
    449 
    450     if (lengthOfBOM || bufferLength + len >= 4)
    451         m_checkedForBOM = true;
    452 
    453     return lengthOfBOM;
    454 }
    455 
    456 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
    457 {
    458     if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
    459         m_checkedForCSSCharset = true;
    460         return true;
    461     }
    462 
    463     size_t oldSize = m_buffer.size();
    464     m_buffer.grow(oldSize + len);
    465     memcpy(m_buffer.data() + oldSize, data, len);
    466 
    467     movedDataToBuffer = true;
    468 
    469     if (m_buffer.size() > 8) { // strlen("@charset") == 8
    470         const char* dataStart = m_buffer.data();
    471         const char* dataEnd = dataStart + m_buffer.size();
    472 
    473         if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' &&
    474             dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
    475 
    476             dataStart += 8;
    477             const char* pos = dataStart;
    478             if (!skipWhitespace(pos, dataEnd))
    479                 return false;
    480 
    481             if (*pos == '"' || *pos == '\'') {
    482                 char quotationMark = *pos;
    483                 ++pos;
    484                 dataStart = pos;
    485 
    486                 while (pos < dataEnd && *pos != quotationMark)
    487                     ++pos;
    488                 if (pos == dataEnd)
    489                     return false;
    490 
    491                 int encodingNameLength = pos - dataStart + 1;
    492 
    493                 ++pos;
    494                 if (!skipWhitespace(pos, dataEnd))
    495                     return false;
    496 
    497                 if (*pos == ';')
    498                     setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
    499             }
    500         }
    501         m_checkedForCSSCharset = true;
    502         return true;
    503     }
    504     return false;
    505 }
    506 
    507 // Other browsers allow comments in the head section, so we need to also.
    508 // It's important not to look for tags inside the comments.
    509 static inline void skipComment(const char*& ptr, const char* pEnd)
    510 {
    511     const char* p = ptr;
    512     if (p == pEnd)
    513       return;
    514     // Allow <!-->; other browsers do.
    515     if (*p == '>') {
    516         p++;
    517     } else {
    518         while (p + 2 < pEnd) {
    519             if (*p == '-') {
    520                 // This is the real end of comment, "-->".
    521                 if (p[1] == '-' && p[2] == '>') {
    522                     p += 3;
    523                     break;
    524                 }
    525                 // This is the incorrect end of comment that other browsers allow, "--!>".
    526                 if (p + 3 < pEnd && p[1] == '-' && p[2] == '!' && p[3] == '>') {
    527                     p += 4;
    528                     break;
    529                 }
    530             }
    531             p++;
    532         }
    533     }
    534     ptr = p;
    535 }
    536 
    537 const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over.
    538 
    539 bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
    540 {
    541     if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
    542         m_checkedForHeadCharset = true;
    543         return true;
    544     }
    545 
    546     // This is not completely efficient, since the function might go
    547     // through the HTML head several times.
    548 
    549     size_t oldSize = m_buffer.size();
    550     m_buffer.grow(oldSize + len);
    551     memcpy(m_buffer.data() + oldSize, data, len);
    552 
    553     movedDataToBuffer = true;
    554 
    555     const char* ptr = m_buffer.data();
    556     const char* pEnd = ptr + m_buffer.size();
    557 
    558     // Is there enough data available to check for XML declaration?
    559     if (m_buffer.size() < 8)
    560         return false;
    561 
    562     // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
    563     // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
    564     if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
    565         const char* xmlDeclarationEnd = ptr;
    566         while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
    567             ++xmlDeclarationEnd;
    568         if (xmlDeclarationEnd == pEnd)
    569             return false;
    570         // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
    571         int len;
    572         int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
    573         if (pos != -1)
    574             setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
    575         // continue looking for a charset - it may be specified in an HTTP-Equiv meta
    576     } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
    577         setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
    578         return true;
    579     } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
    580         setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
    581         return true;
    582     } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
    583         setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
    584         return true;
    585     } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
    586         setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
    587         return true;
    588     }
    589 
    590     // we still don't have an encoding, and are in the head
    591     // the following tags are allowed in <head>:
    592     // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
    593 
    594     // We stop scanning when a tag that is not permitted in <head>
    595     // is seen, rather when </head> is seen, because that more closely
    596     // matches behavior in other browsers; more details in
    597     // <http://bugs.webkit.org/show_bug.cgi?id=3590>.
    598 
    599     // Additionally, we ignore things that looks like tags in <title>, <script> and <noscript>; see
    600     // <http://bugs.webkit.org/show_bug.cgi?id=4560>, <http://bugs.webkit.org/show_bug.cgi?id=12165>
    601     // and <http://bugs.webkit.org/show_bug.cgi?id=12389>.
    602 
    603     // Since many sites have charset declarations after <body> or other tags that are disallowed in <head>,
    604     // we don't bail out until we've checked at least bytesToCheckUnconditionally bytes of input.
    605 
    606     AtomicStringImpl* enclosingTagName = 0;
    607     bool inHeadSection = true; // Becomes false when </head> or any tag not allowed in head is encountered.
    608 
    609     // the HTTP-EQUIV meta has no effect on XHTML
    610     if (m_contentType == XML)
    611         return true;
    612 
    613     while (ptr + 3 < pEnd) { // +3 guarantees that "<!--" fits in the buffer - and certainly we aren't going to lose any "charset" that way.
    614         if (*ptr == '<') {
    615             bool end = false;
    616             ptr++;
    617 
    618             // Handle comments.
    619             if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
    620                 ptr += 3;
    621                 skipComment(ptr, pEnd);
    622                 if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
    623                     // Some pages that test bandwidth from within the browser do it by having
    624                     // huge comments and measuring the time they take to load. Repeatedly scanning
    625                     // these comments can take a lot of CPU time.
    626                     m_checkedForHeadCharset = true;
    627                     return true;
    628                 }
    629                 continue;
    630             }
    631 
    632             if (*ptr == '/') {
    633                 ++ptr;
    634                 end = true;
    635             }
    636 
    637             // Grab the tag name, but mostly ignore namespaces.
    638             bool sawNamespace = false;
    639             char tagBuffer[20];
    640             int len = 0;
    641             while (len < 19) {
    642                 if (ptr == pEnd)
    643                     return false;
    644                 char c = *ptr;
    645                 if (c == ':') {
    646                     len = 0;
    647                     sawNamespace = true;
    648                     ptr++;
    649                     continue;
    650                 }
    651                 if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
    652                     ;
    653                 else if (c >= 'A' && c <= 'Z')
    654                     c += 'a' - 'A';
    655                 else
    656                     break;
    657                 tagBuffer[len++] = c;
    658                 ptr++;
    659             }
    660             tagBuffer[len] = 0;
    661             AtomicString tag(tagBuffer);
    662 
    663             if (enclosingTagName) {
    664                 if (end && tag.impl() == enclosingTagName)
    665                     enclosingTagName = 0;
    666             } else {
    667                 if (tag == titleTag)
    668                     enclosingTagName = titleTag.localName().impl();
    669                 else if (tag == scriptTag)
    670                     enclosingTagName = scriptTag.localName().impl();
    671                 else if (tag == noscriptTag)
    672                     enclosingTagName = noscriptTag.localName().impl();
    673             }
    674 
    675             // Find where the opening tag ends.
    676             const char* tagContentStart = ptr;
    677             if (!end) {
    678                 while (ptr != pEnd && *ptr != '>') {
    679                     if (*ptr == '\'' || *ptr == '"') {
    680                         char quoteMark = *ptr;
    681                         ++ptr;
    682                         while (ptr != pEnd && *ptr != quoteMark)
    683                             ++ptr;
    684                         if (ptr == pEnd)
    685                             return false;
    686                     }
    687                     ++ptr;
    688                 }
    689                 if (ptr == pEnd)
    690                     return false;
    691                 ++ptr;
    692             }
    693 
    694             if (!end && tag == metaTag && !sawNamespace) {
    695                 const char* str = tagContentStart;
    696                 int length = ptr - tagContentStart;
    697                 int pos = 0;
    698                 while (pos < length) {
    699                     int charsetPos = findIgnoringCase(str + pos, length - pos, "charset");
    700                     if (charsetPos == -1)
    701                         break;
    702                     pos += charsetPos + 7;
    703                     // skip whitespace
    704                     while (pos < length && str[pos] <= ' ')
    705                         pos++;
    706                     if (pos == length)
    707                         break;
    708                     if (str[pos++] != '=')
    709                         continue;
    710                     while ((pos < length) &&
    711                             (str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\''))
    712                         pos++;
    713 
    714                     // end ?
    715                     if (pos == length)
    716                         break;
    717                     int end = pos;
    718                     while (end < length &&
    719                            str[end] != ' ' && str[end] != '"' && str[end] != '\'' &&
    720                            str[end] != ';' && str[end] != '>')
    721                         end++;
    722                     setEncoding(findTextEncoding(str + pos, end - pos), EncodingFromMetaTag);
    723                     if (m_source == EncodingFromMetaTag)
    724                         return true;
    725 
    726                     if (end >= length || str[end] == '/' || str[end] == '>')
    727                         break;
    728 
    729                     pos = end + 1;
    730                 }
    731             } else {
    732                 if (!enclosingTagName && tag != scriptTag && tag != noscriptTag && tag != styleTag
    733                     && tag != linkTag && tag != metaTag && tag != objectTag && tag != titleTag && tag != baseTag
    734                     && (end || tag != htmlTag) && (end || tag != headTag) && isASCIIAlpha(tagBuffer[0])) {
    735                     inHeadSection = false;
    736                 }
    737 
    738                 if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
    739                     m_checkedForHeadCharset = true;
    740                     return true;
    741                 }
    742             }
    743         } else
    744             ++ptr;
    745     }
    746     return false;
    747 }
    748 
    749 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
    750 {
    751     switch (KanjiCode::judge(data, len)) {
    752         case KanjiCode::JIS:
    753             setEncoding("ISO-2022-JP", AutoDetectedEncoding);
    754             break;
    755         case KanjiCode::EUC:
    756             setEncoding("EUC-JP", AutoDetectedEncoding);
    757             break;
    758         case KanjiCode::SJIS:
    759             setEncoding("Shift_JIS", AutoDetectedEncoding);
    760             break;
    761         case KanjiCode::ASCII:
    762         case KanjiCode::UTF16:
    763         case KanjiCode::UTF8:
    764             break;
    765     }
    766 }
    767 
    768 // We use the encoding detector in two cases:
    769 //   1. Encoding detector is turned ON and no other encoding source is
    770 //      available (that is, it's DefaultEncoding).
    771 //   2. Encoding detector is turned ON and the encoding is set to
    772 //      the encoding of the parent frame, which is also auto-detected.
    773 //   Note that condition #2 is NOT satisfied unless parent-child frame
    774 //   relationship is compliant to the same-origin policy. If they're from
    775 //   different domains, |m_source| would not be set to EncodingFromParentFrame
    776 //   in the first place.
    777 bool TextResourceDecoder::shouldAutoDetect() const
    778 {
    779     // Just checking m_hintEncoding suffices here because it's only set
    780     // in setHintEncoding when the source is AutoDetectedEncoding.
    781     return m_usesEncodingDetector
    782         && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
    783 }
    784 
    785 String TextResourceDecoder::decode(const char* data, size_t len)
    786 {
    787     size_t lengthOfBOM = 0;
    788     if (!m_checkedForBOM)
    789         lengthOfBOM = checkForBOM(data, len);
    790 
    791     bool movedDataToBuffer = false;
    792 
    793     if (m_contentType == CSS && !m_checkedForCSSCharset)
    794         if (!checkForCSSCharset(data, len, movedDataToBuffer))
    795             return "";
    796 
    797     if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
    798         if (!checkForHeadCharset(data, len, movedDataToBuffer))
    799             return "";
    800 
    801     // FIXME: It is wrong to change the encoding downstream after we have already done some decoding.
    802     if (shouldAutoDetect()) {
    803         if (m_encoding.isJapanese())
    804             detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
    805         else {
    806             TextEncoding detectedEncoding;
    807             if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
    808                 setEncoding(detectedEncoding, AutoDetectedEncoding);
    809         }
    810     }
    811 
    812     ASSERT(m_encoding.isValid());
    813 
    814     if (!m_codec)
    815         m_codec.set(newTextCodec(m_encoding).release());
    816 
    817     if (m_buffer.isEmpty())
    818         return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError);
    819 
    820     if (!movedDataToBuffer) {
    821         size_t oldSize = m_buffer.size();
    822         m_buffer.grow(oldSize + len);
    823         memcpy(m_buffer.data() + oldSize, data, len);
    824     }
    825 
    826     String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
    827     m_buffer.clear();
    828     return result;
    829 }
    830 
    831 String TextResourceDecoder::flush()
    832 {
    833    // If we can not identify the encoding even after a document is completely
    834    // loaded, we need to detect the encoding if other conditions for
    835    // autodetection is satisfied.
    836     if (m_buffer.size() && shouldAutoDetect()
    837         && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
    838          TextEncoding detectedEncoding;
    839          if (detectTextEncoding(m_buffer.data(), m_buffer.size(),
    840                                 m_hintEncoding, &detectedEncoding))
    841              setEncoding(detectedEncoding, AutoDetectedEncoding);
    842     }
    843 
    844     if (!m_codec)
    845         m_codec.set(newTextCodec(m_encoding).release());
    846 
    847     String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
    848     m_buffer.clear();
    849     m_codec.clear();
    850     m_checkedForBOM = false; // Skip BOM again when re-decoding.
    851     return result;
    852 }
    853 
    854 }
    855