Home | History | Annotate | Download | only in mhtml
      1 /*
      2  * Copyright (C) 2011 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "platform/mhtml/MHTMLArchive.h"
     33 
     34 #include "platform/DateComponents.h"
     35 #include "platform/MIMETypeRegistry.h"
     36 #include "platform/SerializedResource.h"
     37 #include "platform/SharedBuffer.h"
     38 #include "platform/mhtml/ArchiveResource.h"
     39 #include "platform/mhtml/MHTMLParser.h"
     40 #include "platform/text/QuotedPrintable.h"
     41 #include "platform/weborigin/SchemeRegistry.h"
     42 #include "wtf/CryptographicallyRandomNumber.h"
     43 #include "wtf/DateMath.h"
     44 #include "wtf/text/Base64.h"
     45 #include "wtf/text/StringBuilder.h"
     46 
     47 namespace blink {
     48 
     49 const char* const quotedPrintable = "quoted-printable";
     50 const char* const base64 = "base64";
     51 const char* const binary = "binary";
     52 
     53 static String generateRandomBoundary()
     54 {
     55     // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0).
     56     const size_t randomValuesLength = 10;
     57     char randomValues[randomValuesLength];
     58     cryptographicallyRandomValues(&randomValues, randomValuesLength);
     59     StringBuilder stringBuilder;
     60     stringBuilder.appendLiteral("----=_NextPart_000_");
     61     for (size_t i = 0; i < randomValuesLength; ++i) {
     62         if (i == 2)
     63             stringBuilder.append('_');
     64         else if (i == 6)
     65             stringBuilder.append('.');
     66         stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i]));
     67         stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i]));
     68     }
     69     return stringBuilder.toString();
     70 }
     71 
     72 static String replaceNonPrintableCharacters(const String& text)
     73 {
     74     StringBuilder stringBuilder;
     75     for (size_t i = 0; i < text.length(); ++i) {
     76         if (isASCIIPrintable(text[i]))
     77             stringBuilder.append(text[i]);
     78         else
     79             stringBuilder.append('?');
     80     }
     81     return stringBuilder.toString();
     82 }
     83 
     84 MHTMLArchive::MHTMLArchive()
     85 {
     86 }
     87 
     88 MHTMLArchive::~MHTMLArchive()
     89 {
     90 #if !ENABLE(OILPAN)
     91     // Because all frames know about each other we need to perform a deep clearing of the archives graph.
     92     clearAllSubframeArchives();
     93 #endif
     94 }
     95 
     96 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLArchive::create()
     97 {
     98     return adoptRefWillBeNoop(new MHTMLArchive);
     99 }
    100 
    101 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLArchive::create(const KURL& url, SharedBuffer* data)
    102 {
    103     // For security reasons we only load MHTML pages from local URLs.
    104     if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol()))
    105         return nullptr;
    106 
    107     MHTMLParser parser(data);
    108     RefPtrWillBeRawPtr<MHTMLArchive> mainArchive = parser.parseArchive();
    109     if (!mainArchive)
    110         return nullptr; // Invalid MHTML file.
    111 
    112     // Since MHTML is a flat format, we need to make all frames aware of all resources.
    113     for (size_t i = 0; i < parser.frameCount(); ++i) {
    114         RefPtrWillBeRawPtr<MHTMLArchive> archive = parser.frameAt(i);
    115         for (size_t j = 1; j < parser.frameCount(); ++j) {
    116             if (i != j)
    117                 archive->addSubframeArchive(parser.frameAt(j));
    118         }
    119         for (size_t j = 0; j < parser.subResourceCount(); ++j)
    120             archive->addSubresource(parser.subResourceAt(j));
    121     }
    122     return mainArchive.release();
    123 }
    124 
    125 PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(const Vector<SerializedResource>& resources, EncodingPolicy encodingPolicy, const String& title, const String& mimeType)
    126 {
    127     String boundary = generateRandomBoundary();
    128     String endOfResourceBoundary = "--" + boundary + "\r\n";
    129 
    130     DateComponents now;
    131     now.setMillisecondsSinceEpochForDateTime(currentTimeMS());
    132     String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.fullYear(), now.hour(), now.minute(), now.second(), 0);
    133 
    134     StringBuilder stringBuilder;
    135     stringBuilder.appendLiteral("From: <Saved by WebKit>\r\n");
    136     stringBuilder.appendLiteral("Subject: ");
    137     // We replace non ASCII characters with '?' characters to match IE's behavior.
    138     stringBuilder.append(replaceNonPrintableCharacters(title));
    139     stringBuilder.appendLiteral("\r\nDate: ");
    140     stringBuilder.append(dateString);
    141     stringBuilder.appendLiteral("\r\nMIME-Version: 1.0\r\n");
    142     stringBuilder.appendLiteral("Content-Type: multipart/related;\r\n");
    143     stringBuilder.appendLiteral("\ttype=\"");
    144     stringBuilder.append(mimeType);
    145     stringBuilder.appendLiteral("\";\r\n");
    146     stringBuilder.appendLiteral("\tboundary=\"");
    147     stringBuilder.append(boundary);
    148     stringBuilder.appendLiteral("\"\r\n\r\n");
    149 
    150     // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it).
    151     ASSERT(stringBuilder.toString().containsOnlyASCII());
    152     CString asciiString = stringBuilder.toString().utf8();
    153     RefPtr<SharedBuffer> mhtmlData = SharedBuffer::create();
    154     mhtmlData->append(asciiString.data(), asciiString.length());
    155 
    156     for (size_t i = 0; i < resources.size(); ++i) {
    157         const SerializedResource& resource = resources[i];
    158 
    159         stringBuilder.clear();
    160         stringBuilder.append(endOfResourceBoundary);
    161         stringBuilder.appendLiteral("Content-Type: ");
    162         stringBuilder.append(resource.mimeType);
    163 
    164         const char* contentEncoding = 0;
    165         if (encodingPolicy == UseBinaryEncoding)
    166             contentEncoding = binary;
    167         else if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType))
    168             contentEncoding = quotedPrintable;
    169         else
    170             contentEncoding = base64;
    171 
    172         stringBuilder.appendLiteral("\r\nContent-Transfer-Encoding: ");
    173         stringBuilder.append(contentEncoding);
    174         stringBuilder.appendLiteral("\r\nContent-Location: ");
    175         stringBuilder.append(resource.url);
    176         stringBuilder.appendLiteral("\r\n\r\n");
    177 
    178         asciiString = stringBuilder.toString().utf8();
    179         mhtmlData->append(asciiString.data(), asciiString.length());
    180 
    181         if (!strcmp(contentEncoding, binary)) {
    182             const char* data;
    183             size_t position = 0;
    184             while (size_t length = resource.data->getSomeData(data, position)) {
    185                 mhtmlData->append(data, length);
    186                 position += length;
    187             }
    188         } else {
    189             // FIXME: ideally we would encode the content as a stream without having to fetch it all.
    190             const char* data = resource.data->data();
    191             size_t dataLength = resource.data->size();
    192             Vector<char> encodedData;
    193             if (!strcmp(contentEncoding, quotedPrintable)) {
    194                 quotedPrintableEncode(data, dataLength, encodedData);
    195                 mhtmlData->append(encodedData.data(), encodedData.size());
    196                 mhtmlData->append("\r\n", 2);
    197             } else {
    198                 ASSERT(!strcmp(contentEncoding, base64));
    199                 // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs.
    200                 base64Encode(data, dataLength, encodedData);
    201                 const size_t maximumLineLength = 76;
    202                 size_t index = 0;
    203                 size_t encodedDataLength = encodedData.size();
    204                 do {
    205                     size_t lineLength = std::min(encodedDataLength - index, maximumLineLength);
    206                     mhtmlData->append(encodedData.data() + index, lineLength);
    207                     mhtmlData->append("\r\n", 2);
    208                     index += maximumLineLength;
    209                 } while (index < encodedDataLength);
    210             }
    211         }
    212     }
    213 
    214     asciiString = String("--" + boundary + "--\r\n").utf8();
    215     mhtmlData->append(asciiString.data(), asciiString.length());
    216 
    217     return mhtmlData.release();
    218 }
    219 
    220 #if !ENABLE(OILPAN)
    221 void MHTMLArchive::clearAllSubframeArchives()
    222 {
    223     SubFrameArchives clearedArchives;
    224     clearAllSubframeArchivesImpl(&clearedArchives);
    225 }
    226 
    227 void MHTMLArchive::clearAllSubframeArchivesImpl(SubFrameArchives* clearedArchives)
    228 {
    229     for (SubFrameArchives::iterator it = m_subframeArchives.begin(); it != m_subframeArchives.end(); ++it) {
    230         if (!clearedArchives->contains(*it)) {
    231             clearedArchives->append(*it);
    232             (*it)->clearAllSubframeArchivesImpl(clearedArchives);
    233         }
    234     }
    235     m_subframeArchives.clear();
    236 }
    237 #endif
    238 
    239 void MHTMLArchive::setMainResource(PassRefPtrWillBeRawPtr<ArchiveResource> mainResource)
    240 {
    241     m_mainResource = mainResource;
    242 }
    243 
    244 void MHTMLArchive::addSubresource(PassRefPtrWillBeRawPtr<ArchiveResource> subResource)
    245 {
    246     m_subresources.append(subResource);
    247 }
    248 
    249 void MHTMLArchive::addSubframeArchive(PassRefPtrWillBeRawPtr<MHTMLArchive> subframeArchive)
    250 {
    251     m_subframeArchives.append(subframeArchive);
    252 }
    253 
    254 void MHTMLArchive::trace(Visitor* visitor)
    255 {
    256     visitor->trace(m_mainResource);
    257     visitor->trace(m_subresources);
    258     visitor->trace(m_subframeArchives);
    259 }
    260 
    261 }
    262