Home | History | Annotate | Download | only in archive
      1 /*
      2  * Copyright (C) 2011 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "core/loader/archive/MHTMLParser.h"
     33 
     34 #include "core/loader/archive/MHTMLArchive.h"
     35 #include "core/platform/MIMETypeRegistry.h"
     36 #include "core/platform/network/MIMEHeader.h"
     37 #include "core/platform/text/QuotedPrintable.h"
     38 #include "wtf/text/Base64.h"
     39 
     40 namespace WebCore {
     41 
     42 static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary)
     43 {
     44     String line;
     45     while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
     46         if (line == boundary)
     47             return true;
     48     }
     49     return false;
     50 }
     51 
     52 MHTMLParser::MHTMLParser(SharedBuffer* data)
     53     : m_lineReader(data, "\r\n")
     54 {
     55 }
     56 
     57 PassRefPtr<MHTMLArchive> MHTMLParser::parseArchive()
     58 {
     59     RefPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader);
     60     return parseArchiveWithHeader(header.get());
     61 }
     62 
     63 PassRefPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header)
     64 {
     65     if (!header) {
     66         LOG_ERROR("Failed to parse MHTML part: no header.");
     67         return 0;
     68     }
     69 
     70     RefPtr<MHTMLArchive> archive = MHTMLArchive::create();
     71     if (!header->isMultipart()) {
     72         // With IE a page with no resource is not multi-part.
     73         bool endOfArchiveReached = false;
     74         RefPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached);
     75         if (!resource)
     76             return 0;
     77         archive->setMainResource(resource);
     78         return archive;
     79     }
     80 
     81     // Skip the message content (it's a generic browser specific message).
     82     skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
     83 
     84     bool endOfArchive = false;
     85     while (!endOfArchive) {
     86         RefPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader);
     87         if (!resourceHeader) {
     88             LOG_ERROR("Failed to parse MHTML, invalid MIME header.");
     89             return 0;
     90         }
     91         if (resourceHeader->contentType() == "multipart/alternative") {
     92             // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames).
     93             RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get());
     94             if (!subframeArchive) {
     95                 LOG_ERROR("Failed to parse MHTML subframe.");
     96                 return 0;
     97             }
     98             bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
     99             ASSERT_UNUSED(endOfPartReached, endOfPartReached);
    100             // The top-frame is the first frame found, regardless of the nesting level.
    101             if (subframeArchive->mainResource())
    102                 addResourceToArchive(subframeArchive->mainResource(), archive.get());
    103             archive->addSubframeArchive(subframeArchive);
    104             continue;
    105         }
    106 
    107         RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive);
    108         if (!resource) {
    109             LOG_ERROR("Failed to parse MHTML part.");
    110             return 0;
    111         }
    112         addResourceToArchive(resource.get(), archive.get());
    113     }
    114 
    115     return archive.release();
    116 }
    117 
    118 void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive)
    119 {
    120     const String& mimeType = resource->mimeType();
    121     if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") {
    122         m_resources.append(resource);
    123         return;
    124     }
    125 
    126     // The first document suitable resource is the main frame.
    127     if (!archive->mainResource()) {
    128         archive->setMainResource(resource);
    129         m_frames.append(archive);
    130         return;
    131     }
    132 
    133     RefPtr<MHTMLArchive> subframe = MHTMLArchive::create();
    134     subframe->setMainResource(resource);
    135     m_frames.append(subframe);
    136 }
    137 
    138 PassRefPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached)
    139 {
    140     ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());
    141 
    142     RefPtr<SharedBuffer> content = SharedBuffer::create();
    143     const bool checkBoundary = !endOfPartBoundary.isEmpty();
    144     bool endOfPartReached = false;
    145     if (mimeHeader.contentTransferEncoding() == MIMEHeader::Binary) {
    146         if (!checkBoundary) {
    147             LOG_ERROR("Binary contents requires end of part");
    148             return 0;
    149         }
    150         m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
    151         Vector<char> part;
    152         if (!m_lineReader.nextChunk(part)) {
    153             LOG_ERROR("Binary contents requires end of part");
    154             return 0;
    155          }
    156          content->append(part);
    157          m_lineReader.setSeparator("\r\n");
    158          Vector<char> nextChars;
    159          if (m_lineReader.peek(nextChars, 2) != 2) {
    160              LOG_ERROR("Invalid seperator.");
    161              return 0;
    162          }
    163          endOfPartReached = true;
    164          ASSERT(nextChars.size() == 2);
    165          endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-');
    166          if (!endOfArchiveReached) {
    167              String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
    168              if (!line.isEmpty()) {
    169                  LOG_ERROR("No CRLF at end of binary section.");
    170                  return 0;
    171              }
    172          }
    173     } else {
    174         String line;
    175         while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
    176             endOfArchiveReached = (line == endOfDocumentBoundary);
    177             if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) {
    178                 endOfPartReached = true;
    179                 break;
    180             }
    181             // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'.
    182             content->append(line.utf8().data(), line.length());
    183             if (mimeHeader.contentTransferEncoding() == MIMEHeader::QuotedPrintable) {
    184                 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines.
    185                 content->append("\r\n", 2);
    186             }
    187         }
    188     }
    189     if (!endOfPartReached && checkBoundary) {
    190         LOG_ERROR("No bounday found for MHTML part.");
    191         return 0;
    192     }
    193 
    194     Vector<char> data;
    195     switch (mimeHeader.contentTransferEncoding()) {
    196     case MIMEHeader::Base64:
    197         if (!base64Decode(content->data(), content->size(), data)) {
    198             LOG_ERROR("Invalid base64 content for MHTML part.");
    199             return 0;
    200         }
    201         break;
    202     case MIMEHeader::QuotedPrintable:
    203         quotedPrintableDecode(content->data(), content->size(), data);
    204         break;
    205     case MIMEHeader::SevenBit:
    206     case MIMEHeader::Binary:
    207         data.append(content->data(), content->size());
    208         break;
    209     default:
    210         LOG_ERROR("Invalid encoding for MHTML part.");
    211         return 0;
    212     }
    213     RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data);
    214     // FIXME: the URL in the MIME header could be relative, we should resolve it if it is.
    215     // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5
    216     // IE and Firefox (UNMht) seem to generate only absolute URLs.
    217     KURL location = KURL(KURL(), mimeHeader.contentLocation());
    218     return ArchiveResource::create(contentBuffer, location, mimeHeader.contentType(), mimeHeader.charset(), String());
    219 }
    220 
    221 size_t MHTMLParser::frameCount() const
    222 {
    223     return m_frames.size();
    224 }
    225 
    226 MHTMLArchive* MHTMLParser::frameAt(size_t index) const
    227 {
    228     return m_frames[index].get();
    229 }
    230 
    231 size_t MHTMLParser::subResourceCount() const
    232 {
    233     return m_resources.size();
    234 }
    235 
    236 ArchiveResource* MHTMLParser::subResourceAt(size_t index) const
    237 {
    238     return m_resources[index].get();
    239 }
    240 
    241 }
    242