1 /* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "platform/mhtml/MHTMLArchive.h" 33 34 #include "platform/MIMETypeRegistry.h" 35 #include "platform/SerializedResource.h" 36 #include "platform/SharedBuffer.h" 37 #include "platform/mhtml/MHTMLParser.h" 38 #include "platform/text/QuotedPrintable.h" 39 #include "platform/weborigin/SchemeRegistry.h" 40 #include "wtf/CryptographicallyRandomNumber.h" 41 #include "wtf/DateMath.h" 42 #include "wtf/GregorianDateTime.h" 43 #include "wtf/text/Base64.h" 44 #include "wtf/text/StringBuilder.h" 45 46 namespace WebCore { 47 48 const char* const quotedPrintable = "quoted-printable"; 49 const char* const base64 = "base64"; 50 const char* const binary = "binary"; 51 52 static String generateRandomBoundary() 53 { 54 // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0). 55 const size_t randomValuesLength = 10; 56 char randomValues[randomValuesLength]; 57 cryptographicallyRandomValues(&randomValues, randomValuesLength); 58 StringBuilder stringBuilder; 59 stringBuilder.append("----=_NextPart_000_"); 60 for (size_t i = 0; i < randomValuesLength; ++i) { 61 if (i == 2) 62 stringBuilder.append('_'); 63 else if (i == 6) 64 stringBuilder.append('.'); 65 stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i])); 66 stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i])); 67 } 68 return stringBuilder.toString(); 69 } 70 71 static String replaceNonPrintableCharacters(const String& text) 72 { 73 StringBuilder stringBuilder; 74 for (size_t i = 0; i < text.length(); ++i) { 75 if (isASCIIPrintable(text[i])) 76 stringBuilder.append(text[i]); 77 else 78 stringBuilder.append('?'); 79 } 80 return stringBuilder.toString(); 81 } 82 83 MHTMLArchive::MHTMLArchive() 84 { 85 } 86 87 MHTMLArchive::~MHTMLArchive() 88 { 89 // Because all frames know about each other we need to perform a deep clearing of the archives graph. 90 clearAllSubframeArchives(); 91 } 92 93 PassRefPtr<MHTMLArchive> MHTMLArchive::create() 94 { 95 return adoptRef(new MHTMLArchive); 96 } 97 98 PassRefPtr<MHTMLArchive> MHTMLArchive::create(const KURL& url, SharedBuffer* data) 99 { 100 // For security reasons we only load MHTML pages from local URLs. 101 if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol())) 102 return 0; 103 104 MHTMLParser parser(data); 105 RefPtr<MHTMLArchive> mainArchive = parser.parseArchive(); 106 if (!mainArchive) 107 return 0; // Invalid MHTML file. 108 109 // Since MHTML is a flat format, we need to make all frames aware of all resources. 110 for (size_t i = 0; i < parser.frameCount(); ++i) { 111 RefPtr<MHTMLArchive> archive = parser.frameAt(i); 112 for (size_t j = 1; j < parser.frameCount(); ++j) { 113 if (i != j) 114 archive->addSubframeArchive(parser.frameAt(j)); 115 } 116 for (size_t j = 0; j < parser.subResourceCount(); ++j) 117 archive->addSubresource(parser.subResourceAt(j)); 118 } 119 return mainArchive.release(); 120 } 121 122 PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(const Vector<SerializedResource>& resources, EncodingPolicy encodingPolicy, const String& title, const String& mimeType) 123 { 124 String boundary = generateRandomBoundary(); 125 String endOfResourceBoundary = "--" + boundary + "\r\n"; 126 127 GregorianDateTime now; 128 now.setToCurrentLocalTime(); 129 String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60); 130 131 StringBuilder stringBuilder; 132 stringBuilder.append("From: <Saved by WebKit>\r\n"); 133 stringBuilder.append("Subject: "); 134 // We replace non ASCII characters with '?' characters to match IE's behavior. 135 stringBuilder.append(replaceNonPrintableCharacters(title)); 136 stringBuilder.append("\r\nDate: "); 137 stringBuilder.append(dateString); 138 stringBuilder.append("\r\nMIME-Version: 1.0\r\n"); 139 stringBuilder.append("Content-Type: multipart/related;\r\n"); 140 stringBuilder.append("\ttype=\""); 141 stringBuilder.append(mimeType); 142 stringBuilder.append("\";\r\n"); 143 stringBuilder.append("\tboundary=\""); 144 stringBuilder.append(boundary); 145 stringBuilder.append("\"\r\n\r\n"); 146 147 // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it). 148 ASSERT(stringBuilder.toString().containsOnlyASCII()); 149 CString asciiString = stringBuilder.toString().utf8(); 150 RefPtr<SharedBuffer> mhtmlData = SharedBuffer::create(); 151 mhtmlData->append(asciiString.data(), asciiString.length()); 152 153 for (size_t i = 0; i < resources.size(); ++i) { 154 const SerializedResource& resource = resources[i]; 155 156 stringBuilder.clear(); 157 stringBuilder.append(endOfResourceBoundary); 158 stringBuilder.append("Content-Type: "); 159 stringBuilder.append(resource.mimeType); 160 161 const char* contentEncoding = 0; 162 if (encodingPolicy == UseBinaryEncoding) 163 contentEncoding = binary; 164 else if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType)) 165 contentEncoding = quotedPrintable; 166 else 167 contentEncoding = base64; 168 169 stringBuilder.append("\r\nContent-Transfer-Encoding: "); 170 stringBuilder.append(contentEncoding); 171 stringBuilder.append("\r\nContent-Location: "); 172 stringBuilder.append(resource.url); 173 stringBuilder.append("\r\n\r\n"); 174 175 asciiString = stringBuilder.toString().utf8(); 176 mhtmlData->append(asciiString.data(), asciiString.length()); 177 178 if (!strcmp(contentEncoding, binary)) { 179 const char* data; 180 size_t position = 0; 181 while (size_t length = resource.data->getSomeData(data, position)) { 182 mhtmlData->append(data, length); 183 position += length; 184 } 185 } else { 186 // FIXME: ideally we would encode the content as a stream without having to fetch it all. 187 const char* data = resource.data->data(); 188 size_t dataLength = resource.data->size(); 189 Vector<char> encodedData; 190 if (!strcmp(contentEncoding, quotedPrintable)) { 191 quotedPrintableEncode(data, dataLength, encodedData); 192 mhtmlData->append(encodedData.data(), encodedData.size()); 193 mhtmlData->append("\r\n", 2); 194 } else { 195 ASSERT(!strcmp(contentEncoding, base64)); 196 // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs. 197 base64Encode(data, dataLength, encodedData); 198 const size_t maximumLineLength = 76; 199 size_t index = 0; 200 size_t encodedDataLength = encodedData.size(); 201 do { 202 size_t lineLength = std::min(encodedDataLength - index, maximumLineLength); 203 mhtmlData->append(encodedData.data() + index, lineLength); 204 mhtmlData->append("\r\n", 2); 205 index += maximumLineLength; 206 } while (index < encodedDataLength); 207 } 208 } 209 } 210 211 asciiString = String("--" + boundary + "--\r\n").utf8(); 212 mhtmlData->append(asciiString.data(), asciiString.length()); 213 214 return mhtmlData.release(); 215 } 216 217 void MHTMLArchive::clearAllSubframeArchives() 218 { 219 Vector<RefPtr<MHTMLArchive> > clearedArchives; 220 clearAllSubframeArchivesImpl(&clearedArchives); 221 } 222 223 void MHTMLArchive::clearAllSubframeArchivesImpl(Vector<RefPtr<MHTMLArchive> >* clearedArchives) 224 { 225 for (Vector<RefPtr<MHTMLArchive> >::iterator it = m_subframeArchives.begin(); it != m_subframeArchives.end(); ++it) { 226 if (!clearedArchives->contains(*it)) { 227 clearedArchives->append(*it); 228 (*it)->clearAllSubframeArchivesImpl(clearedArchives); 229 } 230 } 231 m_subframeArchives.clear(); 232 } 233 234 } 235