Home | History | Annotate | Download | only in mhtml
      1 /*
      2  * Copyright (C) 2011 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "platform/mhtml/MHTMLParser.h"
     33 
     34 #include "platform/MIMETypeRegistry.h"
     35 #include "platform/mhtml/MHTMLArchive.h"
     36 #include "platform/network/ParsedContentType.h"
     37 #include "platform/text/QuotedPrintable.h"
     38 #include "wtf/HashMap.h"
     39 #include "wtf/RefCounted.h"
     40 #include "wtf/text/Base64.h"
     41 #include "wtf/text/StringBuilder.h"
     42 #include "wtf/text/StringConcatenate.h"
     43 #include "wtf/text/StringHash.h"
     44 #include "wtf/text/WTFString.h"
     45 
     46 namespace WebCore {
     47 
     48 // This class is a limited MIME parser used to parse the MIME headers of MHTML files.
     49 class MIMEHeader : public RefCounted<MIMEHeader> {
     50 public:
     51     enum Encoding {
     52         QuotedPrintable,
     53         Base64,
     54         EightBit,
     55         SevenBit,
     56         Binary,
     57         Unknown
     58     };
     59 
     60     static PassRefPtr<MIMEHeader> parseHeader(SharedBufferChunkReader* crLFLineReader);
     61 
     62     bool isMultipart() const { return m_contentType.startsWith("multipart/"); }
     63 
     64     String contentType() const { return m_contentType; }
     65     String charset() const { return m_charset; }
     66     Encoding contentTransferEncoding() const { return m_contentTransferEncoding; }
     67     String contentLocation() const { return m_contentLocation; }
     68 
     69     // Multi-part type and boundaries are only valid for multipart MIME headers.
     70     String multiPartType() const { return m_multipartType; }
     71     String endOfPartBoundary() const { return m_endOfPartBoundary; }
     72     String endOfDocumentBoundary() const { return m_endOfDocumentBoundary; }
     73 
     74 private:
     75     MIMEHeader();
     76 
     77     static Encoding parseContentTransferEncoding(const String&);
     78 
     79     String m_contentType;
     80     String m_charset;
     81     Encoding m_contentTransferEncoding;
     82     String m_contentLocation;
     83     String m_multipartType;
     84     String m_endOfPartBoundary;
     85     String m_endOfDocumentBoundary;
     86 };
     87 
     88 typedef HashMap<String, String> KeyValueMap;
     89 
     90 static KeyValueMap retrieveKeyValuePairs(WebCore::SharedBufferChunkReader* buffer)
     91 {
     92     KeyValueMap keyValuePairs;
     93     String line;
     94     String key;
     95     StringBuilder value;
     96     while (!(line = buffer->nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
     97         if (line.isEmpty())
     98             break; // Empty line means end of key/value section.
     99         if (line[0] == '\t') {
    100             ASSERT(!key.isEmpty());
    101             value.append(line.substring(1));
    102             continue;
    103         }
    104         // New key/value, store the previous one if any.
    105         if (!key.isEmpty()) {
    106             if (keyValuePairs.find(key) != keyValuePairs.end())
    107                 WTF_LOG_ERROR("Key duplicate found in MIME header. Key is '%s', previous value replaced.", key.ascii().data());
    108             keyValuePairs.add(key, value.toString().stripWhiteSpace());
    109             key = String();
    110             value.clear();
    111         }
    112         size_t semiColonIndex = line.find(':');
    113         if (semiColonIndex == kNotFound) {
    114             // This is not a key value pair, ignore.
    115             continue;
    116         }
    117         key = line.substring(0, semiColonIndex).lower().stripWhiteSpace();
    118         value.append(line.substring(semiColonIndex + 1));
    119     }
    120     // Store the last property if there is one.
    121     if (!key.isEmpty())
    122         keyValuePairs.set(key, value.toString().stripWhiteSpace());
    123     return keyValuePairs;
    124 }
    125 
    126 PassRefPtr<MIMEHeader> MIMEHeader::parseHeader(SharedBufferChunkReader* buffer)
    127 {
    128     RefPtr<MIMEHeader> mimeHeader = adoptRef(new MIMEHeader);
    129     KeyValueMap keyValuePairs = retrieveKeyValuePairs(buffer);
    130     KeyValueMap::iterator mimeParametersIterator = keyValuePairs.find("content-type");
    131     if (mimeParametersIterator != keyValuePairs.end()) {
    132         ParsedContentType parsedContentType(mimeParametersIterator->value);
    133         mimeHeader->m_contentType = parsedContentType.mimeType();
    134         if (!mimeHeader->isMultipart()) {
    135             mimeHeader->m_charset = parsedContentType.charset().stripWhiteSpace();
    136         } else {
    137             mimeHeader->m_multipartType = parsedContentType.parameterValueForName("type");
    138             mimeHeader->m_endOfPartBoundary = parsedContentType.parameterValueForName("boundary");
    139             if (mimeHeader->m_endOfPartBoundary.isNull()) {
    140                 WTF_LOG_ERROR("No boundary found in multipart MIME header.");
    141                 return 0;
    142             }
    143             mimeHeader->m_endOfPartBoundary.insert("--", 0);
    144             mimeHeader->m_endOfDocumentBoundary = mimeHeader->m_endOfPartBoundary;
    145             mimeHeader->m_endOfDocumentBoundary.append("--");
    146         }
    147     }
    148 
    149     mimeParametersIterator = keyValuePairs.find("content-transfer-encoding");
    150     if (mimeParametersIterator != keyValuePairs.end())
    151         mimeHeader->m_contentTransferEncoding = parseContentTransferEncoding(mimeParametersIterator->value);
    152 
    153     mimeParametersIterator = keyValuePairs.find("content-location");
    154     if (mimeParametersIterator != keyValuePairs.end())
    155         mimeHeader->m_contentLocation = mimeParametersIterator->value;
    156 
    157     return mimeHeader.release();
    158 }
    159 
    160 MIMEHeader::Encoding MIMEHeader::parseContentTransferEncoding(const String& text)
    161 {
    162     String encoding = text.stripWhiteSpace().lower();
    163     if (encoding == "base64")
    164         return Base64;
    165     if (encoding == "quoted-printable")
    166         return QuotedPrintable;
    167     if (encoding == "8bit")
    168         return EightBit;
    169     if (encoding == "7bit")
    170         return SevenBit;
    171     if (encoding == "binary")
    172         return Binary;
    173     WTF_LOG_ERROR("Unknown encoding '%s' found in MIME header.", text.ascii().data());
    174     return Unknown;
    175 }
    176 
    177 MIMEHeader::MIMEHeader()
    178     : m_contentTransferEncoding(Unknown)
    179 {
    180 }
    181 
    182 static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary)
    183 {
    184     String line;
    185     while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
    186         if (line == boundary)
    187             return true;
    188     }
    189     return false;
    190 }
    191 
    192 MHTMLParser::MHTMLParser(SharedBuffer* data)
    193     : m_lineReader(data, "\r\n")
    194 {
    195 }
    196 
    197 PassRefPtr<MHTMLArchive> MHTMLParser::parseArchive()
    198 {
    199     RefPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader);
    200     return parseArchiveWithHeader(header.get());
    201 }
    202 
    203 PassRefPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header)
    204 {
    205     if (!header) {
    206         WTF_LOG_ERROR("Failed to parse MHTML part: no header.");
    207         return 0;
    208     }
    209 
    210     RefPtr<MHTMLArchive> archive = MHTMLArchive::create();
    211     if (!header->isMultipart()) {
    212         // With IE a page with no resource is not multi-part.
    213         bool endOfArchiveReached = false;
    214         RefPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached);
    215         if (!resource)
    216             return 0;
    217         archive->setMainResource(resource);
    218         return archive;
    219     }
    220 
    221     // Skip the message content (it's a generic browser specific message).
    222     skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
    223 
    224     bool endOfArchive = false;
    225     while (!endOfArchive) {
    226         RefPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader);
    227         if (!resourceHeader) {
    228             WTF_LOG_ERROR("Failed to parse MHTML, invalid MIME header.");
    229             return 0;
    230         }
    231         if (resourceHeader->contentType() == "multipart/alternative") {
    232             // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames).
    233             RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get());
    234             if (!subframeArchive) {
    235                 WTF_LOG_ERROR("Failed to parse MHTML subframe.");
    236                 return 0;
    237             }
    238             bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
    239             ASSERT_UNUSED(endOfPartReached, endOfPartReached);
    240             // The top-frame is the first frame found, regardless of the nesting level.
    241             if (subframeArchive->mainResource())
    242                 addResourceToArchive(subframeArchive->mainResource(), archive.get());
    243             archive->addSubframeArchive(subframeArchive);
    244             continue;
    245         }
    246 
    247         RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive);
    248         if (!resource) {
    249             WTF_LOG_ERROR("Failed to parse MHTML part.");
    250             return 0;
    251         }
    252         addResourceToArchive(resource.get(), archive.get());
    253     }
    254 
    255     return archive.release();
    256 }
    257 
    258 void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive)
    259 {
    260     const AtomicString& mimeType = resource->mimeType();
    261     if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") {
    262         m_resources.append(resource);
    263         return;
    264     }
    265 
    266     // The first document suitable resource is the main frame.
    267     if (!archive->mainResource()) {
    268         archive->setMainResource(resource);
    269         m_frames.append(archive);
    270         return;
    271     }
    272 
    273     RefPtr<MHTMLArchive> subframe = MHTMLArchive::create();
    274     subframe->setMainResource(resource);
    275     m_frames.append(subframe);
    276 }
    277 
    278 PassRefPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached)
    279 {
    280     ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());
    281 
    282     // If no content transfer encoding is specified, default to binary encoding.
    283     MIMEHeader::Encoding contentTransferEncoding = mimeHeader.contentTransferEncoding();
    284     if (contentTransferEncoding == MIMEHeader::Unknown)
    285         contentTransferEncoding = MIMEHeader::Binary;
    286 
    287     RefPtr<SharedBuffer> content = SharedBuffer::create();
    288     const bool checkBoundary = !endOfPartBoundary.isEmpty();
    289     bool endOfPartReached = false;
    290     if (contentTransferEncoding == MIMEHeader::Binary) {
    291         if (!checkBoundary) {
    292             WTF_LOG_ERROR("Binary contents requires end of part");
    293             return 0;
    294         }
    295         m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
    296         Vector<char> part;
    297         if (!m_lineReader.nextChunk(part)) {
    298             WTF_LOG_ERROR("Binary contents requires end of part");
    299             return 0;
    300         }
    301         content->append(part);
    302         m_lineReader.setSeparator("\r\n");
    303         Vector<char> nextChars;
    304         if (m_lineReader.peek(nextChars, 2) != 2) {
    305             WTF_LOG_ERROR("Invalid seperator.");
    306             return 0;
    307         }
    308         endOfPartReached = true;
    309         ASSERT(nextChars.size() == 2);
    310         endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-');
    311         if (!endOfArchiveReached) {
    312             String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
    313             if (!line.isEmpty()) {
    314                 WTF_LOG_ERROR("No CRLF at end of binary section.");
    315                 return 0;
    316             }
    317         }
    318     } else {
    319         String line;
    320         while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
    321             endOfArchiveReached = (line == endOfDocumentBoundary);
    322             if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) {
    323                 endOfPartReached = true;
    324                 break;
    325             }
    326             // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'.
    327             content->append(line.utf8().data(), line.length());
    328             if (contentTransferEncoding == MIMEHeader::QuotedPrintable) {
    329                 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines.
    330                 content->append("\r\n", 2);
    331             }
    332         }
    333     }
    334     if (!endOfPartReached && checkBoundary) {
    335         WTF_LOG_ERROR("No bounday found for MHTML part.");
    336         return 0;
    337     }
    338 
    339     Vector<char> data;
    340     switch (contentTransferEncoding) {
    341     case MIMEHeader::Base64:
    342         if (!base64Decode(content->data(), content->size(), data)) {
    343             WTF_LOG_ERROR("Invalid base64 content for MHTML part.");
    344             return 0;
    345         }
    346         break;
    347     case MIMEHeader::QuotedPrintable:
    348         quotedPrintableDecode(content->data(), content->size(), data);
    349         break;
    350     case MIMEHeader::EightBit:
    351     case MIMEHeader::SevenBit:
    352     case MIMEHeader::Binary:
    353         data.append(content->data(), content->size());
    354         break;
    355     default:
    356         WTF_LOG_ERROR("Invalid encoding for MHTML part.");
    357         return 0;
    358     }
    359     RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data);
    360     // FIXME: the URL in the MIME header could be relative, we should resolve it if it is.
    361     // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5
    362     // IE and Firefox (UNMht) seem to generate only absolute URLs.
    363     KURL location = KURL(KURL(), mimeHeader.contentLocation());
    364     return ArchiveResource::create(contentBuffer, location, AtomicString(mimeHeader.contentType()), AtomicString(mimeHeader.charset()), String());
    365 }
    366 
    367 size_t MHTMLParser::frameCount() const
    368 {
    369     return m_frames.size();
    370 }
    371 
    372 MHTMLArchive* MHTMLParser::frameAt(size_t index) const
    373 {
    374     return m_frames[index].get();
    375 }
    376 
    377 size_t MHTMLParser::subResourceCount() const
    378 {
    379     return m_resources.size();
    380 }
    381 
    382 ArchiveResource* MHTMLParser::subResourceAt(size_t index) const
    383 {
    384     return m_resources[index].get();
    385 }
    386 
    387 }
    388