Home | History | Annotate | Download | only in mhtml
      1 /*
      2  * Copyright (C) 2011 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "platform/mhtml/MHTMLParser.h"
     33 
     34 #include "platform/MIMETypeRegistry.h"
     35 #include "platform/mhtml/ArchiveResource.h"
     36 #include "platform/mhtml/MHTMLArchive.h"
     37 #include "platform/network/ParsedContentType.h"
     38 #include "platform/text/QuotedPrintable.h"
     39 #include "wtf/HashMap.h"
     40 #include "wtf/RefCounted.h"
     41 #include "wtf/text/Base64.h"
     42 #include "wtf/text/StringBuilder.h"
     43 #include "wtf/text/StringConcatenate.h"
     44 #include "wtf/text/StringHash.h"
     45 #include "wtf/text/WTFString.h"
     46 
     47 namespace blink {
     48 
     49 // This class is a limited MIME parser used to parse the MIME headers of MHTML files.
     50 class MIMEHeader : public RefCountedWillBeGarbageCollectedFinalized<MIMEHeader> {
     51 public:
     52     static PassRefPtrWillBeRawPtr<MIMEHeader> create()
     53     {
     54         return adoptRefWillBeNoop(new MIMEHeader());
     55     }
     56 
     57     enum Encoding {
     58         QuotedPrintable,
     59         Base64,
     60         EightBit,
     61         SevenBit,
     62         Binary,
     63         Unknown
     64     };
     65 
     66     static PassRefPtrWillBeRawPtr<MIMEHeader> parseHeader(SharedBufferChunkReader* crLFLineReader);
     67 
     68     bool isMultipart() const { return m_contentType.startsWith("multipart/"); }
     69 
     70     String contentType() const { return m_contentType; }
     71     String charset() const { return m_charset; }
     72     Encoding contentTransferEncoding() const { return m_contentTransferEncoding; }
     73     String contentLocation() const { return m_contentLocation; }
     74 
     75     // Multi-part type and boundaries are only valid for multipart MIME headers.
     76     String multiPartType() const { return m_multipartType; }
     77     String endOfPartBoundary() const { return m_endOfPartBoundary; }
     78     String endOfDocumentBoundary() const { return m_endOfDocumentBoundary; }
     79 
     80     void trace(Visitor*) { }
     81 
     82 private:
     83     MIMEHeader();
     84 
     85     static Encoding parseContentTransferEncoding(const String&);
     86 
     87     String m_contentType;
     88     String m_charset;
     89     Encoding m_contentTransferEncoding;
     90     String m_contentLocation;
     91     String m_multipartType;
     92     String m_endOfPartBoundary;
     93     String m_endOfDocumentBoundary;
     94 };
     95 
     96 typedef HashMap<String, String> KeyValueMap;
     97 
     98 static KeyValueMap retrieveKeyValuePairs(blink::SharedBufferChunkReader* buffer)
     99 {
    100     KeyValueMap keyValuePairs;
    101     String line;
    102     String key;
    103     StringBuilder value;
    104     while (!(line = buffer->nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
    105         if (line.isEmpty())
    106             break; // Empty line means end of key/value section.
    107         if (line[0] == '\t') {
    108             ASSERT(!key.isEmpty());
    109             value.append(line.substring(1));
    110             continue;
    111         }
    112         // New key/value, store the previous one if any.
    113         if (!key.isEmpty()) {
    114             if (keyValuePairs.find(key) != keyValuePairs.end())
    115                 WTF_LOG_ERROR("Key duplicate found in MIME header. Key is '%s', previous value replaced.", key.ascii().data());
    116             keyValuePairs.add(key, value.toString().stripWhiteSpace());
    117             key = String();
    118             value.clear();
    119         }
    120         size_t semiColonIndex = line.find(':');
    121         if (semiColonIndex == kNotFound) {
    122             // This is not a key value pair, ignore.
    123             continue;
    124         }
    125         key = line.substring(0, semiColonIndex).lower().stripWhiteSpace();
    126         value.append(line.substring(semiColonIndex + 1));
    127     }
    128     // Store the last property if there is one.
    129     if (!key.isEmpty())
    130         keyValuePairs.set(key, value.toString().stripWhiteSpace());
    131     return keyValuePairs;
    132 }
    133 
    134 PassRefPtrWillBeRawPtr<MIMEHeader> MIMEHeader::parseHeader(SharedBufferChunkReader* buffer)
    135 {
    136     RefPtrWillBeRawPtr<MIMEHeader> mimeHeader = MIMEHeader::create();
    137     KeyValueMap keyValuePairs = retrieveKeyValuePairs(buffer);
    138     KeyValueMap::iterator mimeParametersIterator = keyValuePairs.find("content-type");
    139     if (mimeParametersIterator != keyValuePairs.end()) {
    140         ParsedContentType parsedContentType(mimeParametersIterator->value);
    141         mimeHeader->m_contentType = parsedContentType.mimeType();
    142         if (!mimeHeader->isMultipart()) {
    143             mimeHeader->m_charset = parsedContentType.charset().stripWhiteSpace();
    144         } else {
    145             mimeHeader->m_multipartType = parsedContentType.parameterValueForName("type");
    146             mimeHeader->m_endOfPartBoundary = parsedContentType.parameterValueForName("boundary");
    147             if (mimeHeader->m_endOfPartBoundary.isNull()) {
    148                 WTF_LOG_ERROR("No boundary found in multipart MIME header.");
    149                 return nullptr;
    150             }
    151             mimeHeader->m_endOfPartBoundary.insert("--", 0);
    152             mimeHeader->m_endOfDocumentBoundary = mimeHeader->m_endOfPartBoundary;
    153             mimeHeader->m_endOfDocumentBoundary.append("--");
    154         }
    155     }
    156 
    157     mimeParametersIterator = keyValuePairs.find("content-transfer-encoding");
    158     if (mimeParametersIterator != keyValuePairs.end())
    159         mimeHeader->m_contentTransferEncoding = parseContentTransferEncoding(mimeParametersIterator->value);
    160 
    161     mimeParametersIterator = keyValuePairs.find("content-location");
    162     if (mimeParametersIterator != keyValuePairs.end())
    163         mimeHeader->m_contentLocation = mimeParametersIterator->value;
    164 
    165     return mimeHeader.release();
    166 }
    167 
    168 MIMEHeader::Encoding MIMEHeader::parseContentTransferEncoding(const String& text)
    169 {
    170     String encoding = text.stripWhiteSpace().lower();
    171     if (encoding == "base64")
    172         return Base64;
    173     if (encoding == "quoted-printable")
    174         return QuotedPrintable;
    175     if (encoding == "8bit")
    176         return EightBit;
    177     if (encoding == "7bit")
    178         return SevenBit;
    179     if (encoding == "binary")
    180         return Binary;
    181     WTF_LOG_ERROR("Unknown encoding '%s' found in MIME header.", text.ascii().data());
    182     return Unknown;
    183 }
    184 
    185 MIMEHeader::MIMEHeader()
    186     : m_contentTransferEncoding(Unknown)
    187 {
    188 }
    189 
    190 static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary)
    191 {
    192     String line;
    193     while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
    194         if (line == boundary)
    195             return true;
    196     }
    197     return false;
    198 }
    199 
    200 MHTMLParser::MHTMLParser(SharedBuffer* data)
    201     : m_lineReader(data, "\r\n")
    202 {
    203 }
    204 
    205 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchive()
    206 {
    207     RefPtrWillBeRawPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader);
    208     return parseArchiveWithHeader(header.get());
    209 }
    210 
    211 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header)
    212 {
    213     if (!header) {
    214         WTF_LOG_ERROR("Failed to parse MHTML part: no header.");
    215         return nullptr;
    216     }
    217 
    218     RefPtrWillBeRawPtr<MHTMLArchive> archive = MHTMLArchive::create();
    219     if (!header->isMultipart()) {
    220         // With IE a page with no resource is not multi-part.
    221         bool endOfArchiveReached = false;
    222         RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached);
    223         if (!resource)
    224             return nullptr;
    225         archive->setMainResource(resource);
    226         return archive;
    227     }
    228 
    229     // Skip the message content (it's a generic browser specific message).
    230     skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
    231 
    232     bool endOfArchive = false;
    233     while (!endOfArchive) {
    234         RefPtrWillBeRawPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader);
    235         if (!resourceHeader) {
    236             WTF_LOG_ERROR("Failed to parse MHTML, invalid MIME header.");
    237             return nullptr;
    238         }
    239         if (resourceHeader->contentType() == "multipart/alternative") {
    240             // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames).
    241             RefPtrWillBeRawPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get());
    242             if (!subframeArchive) {
    243                 WTF_LOG_ERROR("Failed to parse MHTML subframe.");
    244                 return nullptr;
    245             }
    246             bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
    247             ASSERT_UNUSED(endOfPartReached, endOfPartReached);
    248             // The top-frame is the first frame found, regardless of the nesting level.
    249             if (subframeArchive->mainResource())
    250                 addResourceToArchive(subframeArchive->mainResource(), archive.get());
    251             archive->addSubframeArchive(subframeArchive);
    252             continue;
    253         }
    254 
    255         RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive);
    256         if (!resource) {
    257             WTF_LOG_ERROR("Failed to parse MHTML part.");
    258             return nullptr;
    259         }
    260         addResourceToArchive(resource.get(), archive.get());
    261     }
    262 
    263     return archive.release();
    264 }
    265 
    266 void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive)
    267 {
    268     const AtomicString& mimeType = resource->mimeType();
    269     if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") {
    270         m_resources.append(resource);
    271         return;
    272     }
    273 
    274     // The first document suitable resource is the main frame.
    275     if (!archive->mainResource()) {
    276         archive->setMainResource(resource);
    277         m_frames.append(archive);
    278         return;
    279     }
    280 
    281     RefPtrWillBeRawPtr<MHTMLArchive> subframe = MHTMLArchive::create();
    282     subframe->setMainResource(resource);
    283     m_frames.append(subframe);
    284 }
    285 
    286 PassRefPtrWillBeRawPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached)
    287 {
    288     ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());
    289 
    290     // If no content transfer encoding is specified, default to binary encoding.
    291     MIMEHeader::Encoding contentTransferEncoding = mimeHeader.contentTransferEncoding();
    292     if (contentTransferEncoding == MIMEHeader::Unknown)
    293         contentTransferEncoding = MIMEHeader::Binary;
    294 
    295     RefPtr<SharedBuffer> content = SharedBuffer::create();
    296     const bool checkBoundary = !endOfPartBoundary.isEmpty();
    297     bool endOfPartReached = false;
    298     if (contentTransferEncoding == MIMEHeader::Binary) {
    299         if (!checkBoundary) {
    300             WTF_LOG_ERROR("Binary contents requires end of part");
    301             return nullptr;
    302         }
    303         m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
    304         Vector<char> part;
    305         if (!m_lineReader.nextChunk(part)) {
    306             WTF_LOG_ERROR("Binary contents requires end of part");
    307             return nullptr;
    308         }
    309         content->append(part);
    310         m_lineReader.setSeparator("\r\n");
    311         Vector<char> nextChars;
    312         if (m_lineReader.peek(nextChars, 2) != 2) {
    313             WTF_LOG_ERROR("Invalid seperator.");
    314             return nullptr;
    315         }
    316         endOfPartReached = true;
    317         ASSERT(nextChars.size() == 2);
    318         endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-');
    319         if (!endOfArchiveReached) {
    320             String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
    321             if (!line.isEmpty()) {
    322                 WTF_LOG_ERROR("No CRLF at end of binary section.");
    323                 return nullptr;
    324             }
    325         }
    326     } else {
    327         String line;
    328         while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
    329             endOfArchiveReached = (line == endOfDocumentBoundary);
    330             if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) {
    331                 endOfPartReached = true;
    332                 break;
    333             }
    334             // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'.
    335             content->append(line.utf8().data(), line.length());
    336             if (contentTransferEncoding == MIMEHeader::QuotedPrintable) {
    337                 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines.
    338                 content->append("\r\n", 2);
    339             }
    340         }
    341     }
    342     if (!endOfPartReached && checkBoundary) {
    343         WTF_LOG_ERROR("No bounday found for MHTML part.");
    344         return nullptr;
    345     }
    346 
    347     Vector<char> data;
    348     switch (contentTransferEncoding) {
    349     case MIMEHeader::Base64:
    350         if (!base64Decode(content->data(), content->size(), data)) {
    351             WTF_LOG_ERROR("Invalid base64 content for MHTML part.");
    352             return nullptr;
    353         }
    354         break;
    355     case MIMEHeader::QuotedPrintable:
    356         quotedPrintableDecode(content->data(), content->size(), data);
    357         break;
    358     case MIMEHeader::EightBit:
    359     case MIMEHeader::SevenBit:
    360     case MIMEHeader::Binary:
    361         data.append(content->data(), content->size());
    362         break;
    363     default:
    364         WTF_LOG_ERROR("Invalid encoding for MHTML part.");
    365         return nullptr;
    366     }
    367     RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data);
    368     // FIXME: the URL in the MIME header could be relative, we should resolve it if it is.
    369     // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5
    370     // IE and Firefox (UNMht) seem to generate only absolute URLs.
    371     KURL location = KURL(KURL(), mimeHeader.contentLocation());
    372     return ArchiveResource::create(contentBuffer, location, AtomicString(mimeHeader.contentType()), AtomicString(mimeHeader.charset()), String());
    373 }
    374 
    375 size_t MHTMLParser::frameCount() const
    376 {
    377     return m_frames.size();
    378 }
    379 
    380 MHTMLArchive* MHTMLParser::frameAt(size_t index) const
    381 {
    382     return m_frames[index].get();
    383 }
    384 
    385 size_t MHTMLParser::subResourceCount() const
    386 {
    387     return m_resources.size();
    388 }
    389 
    390 ArchiveResource* MHTMLParser::subResourceAt(size_t index) const
    391 {
    392     return m_resources[index].get();
    393 }
    394 
    395 }
    396