1 /* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "platform/mhtml/MHTMLParser.h" 33 34 #include "platform/MIMETypeRegistry.h" 35 #include "platform/mhtml/ArchiveResource.h" 36 #include "platform/mhtml/MHTMLArchive.h" 37 #include "platform/network/ParsedContentType.h" 38 #include "platform/text/QuotedPrintable.h" 39 #include "wtf/HashMap.h" 40 #include "wtf/RefCounted.h" 41 #include "wtf/text/Base64.h" 42 #include "wtf/text/StringBuilder.h" 43 #include "wtf/text/StringConcatenate.h" 44 #include "wtf/text/StringHash.h" 45 #include "wtf/text/WTFString.h" 46 47 namespace blink { 48 49 // This class is a limited MIME parser used to parse the MIME headers of MHTML files. 50 class MIMEHeader : public RefCountedWillBeGarbageCollectedFinalized<MIMEHeader> { 51 public: 52 static PassRefPtrWillBeRawPtr<MIMEHeader> create() 53 { 54 return adoptRefWillBeNoop(new MIMEHeader()); 55 } 56 57 enum Encoding { 58 QuotedPrintable, 59 Base64, 60 EightBit, 61 SevenBit, 62 Binary, 63 Unknown 64 }; 65 66 static PassRefPtrWillBeRawPtr<MIMEHeader> parseHeader(SharedBufferChunkReader* crLFLineReader); 67 68 bool isMultipart() const { return m_contentType.startsWith("multipart/"); } 69 70 String contentType() const { return m_contentType; } 71 String charset() const { return m_charset; } 72 Encoding contentTransferEncoding() const { return m_contentTransferEncoding; } 73 String contentLocation() const { return m_contentLocation; } 74 75 // Multi-part type and boundaries are only valid for multipart MIME headers. 76 String multiPartType() const { return m_multipartType; } 77 String endOfPartBoundary() const { return m_endOfPartBoundary; } 78 String endOfDocumentBoundary() const { return m_endOfDocumentBoundary; } 79 80 void trace(Visitor*) { } 81 82 private: 83 MIMEHeader(); 84 85 static Encoding parseContentTransferEncoding(const String&); 86 87 String m_contentType; 88 String m_charset; 89 Encoding m_contentTransferEncoding; 90 String m_contentLocation; 91 String m_multipartType; 92 String m_endOfPartBoundary; 93 String m_endOfDocumentBoundary; 94 }; 95 96 typedef HashMap<String, String> KeyValueMap; 97 98 static KeyValueMap retrieveKeyValuePairs(blink::SharedBufferChunkReader* buffer) 99 { 100 KeyValueMap keyValuePairs; 101 String line; 102 String key; 103 StringBuilder value; 104 while (!(line = buffer->nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 105 if (line.isEmpty()) 106 break; // Empty line means end of key/value section. 107 if (line[0] == '\t') { 108 ASSERT(!key.isEmpty()); 109 value.append(line.substring(1)); 110 continue; 111 } 112 // New key/value, store the previous one if any. 113 if (!key.isEmpty()) { 114 if (keyValuePairs.find(key) != keyValuePairs.end()) 115 WTF_LOG_ERROR("Key duplicate found in MIME header. Key is '%s', previous value replaced.", key.ascii().data()); 116 keyValuePairs.add(key, value.toString().stripWhiteSpace()); 117 key = String(); 118 value.clear(); 119 } 120 size_t semiColonIndex = line.find(':'); 121 if (semiColonIndex == kNotFound) { 122 // This is not a key value pair, ignore. 123 continue; 124 } 125 key = line.substring(0, semiColonIndex).lower().stripWhiteSpace(); 126 value.append(line.substring(semiColonIndex + 1)); 127 } 128 // Store the last property if there is one. 129 if (!key.isEmpty()) 130 keyValuePairs.set(key, value.toString().stripWhiteSpace()); 131 return keyValuePairs; 132 } 133 134 PassRefPtrWillBeRawPtr<MIMEHeader> MIMEHeader::parseHeader(SharedBufferChunkReader* buffer) 135 { 136 RefPtrWillBeRawPtr<MIMEHeader> mimeHeader = MIMEHeader::create(); 137 KeyValueMap keyValuePairs = retrieveKeyValuePairs(buffer); 138 KeyValueMap::iterator mimeParametersIterator = keyValuePairs.find("content-type"); 139 if (mimeParametersIterator != keyValuePairs.end()) { 140 ParsedContentType parsedContentType(mimeParametersIterator->value); 141 mimeHeader->m_contentType = parsedContentType.mimeType(); 142 if (!mimeHeader->isMultipart()) { 143 mimeHeader->m_charset = parsedContentType.charset().stripWhiteSpace(); 144 } else { 145 mimeHeader->m_multipartType = parsedContentType.parameterValueForName("type"); 146 mimeHeader->m_endOfPartBoundary = parsedContentType.parameterValueForName("boundary"); 147 if (mimeHeader->m_endOfPartBoundary.isNull()) { 148 WTF_LOG_ERROR("No boundary found in multipart MIME header."); 149 return nullptr; 150 } 151 mimeHeader->m_endOfPartBoundary.insert("--", 0); 152 mimeHeader->m_endOfDocumentBoundary = mimeHeader->m_endOfPartBoundary; 153 mimeHeader->m_endOfDocumentBoundary.append("--"); 154 } 155 } 156 157 mimeParametersIterator = keyValuePairs.find("content-transfer-encoding"); 158 if (mimeParametersIterator != keyValuePairs.end()) 159 mimeHeader->m_contentTransferEncoding = parseContentTransferEncoding(mimeParametersIterator->value); 160 161 mimeParametersIterator = keyValuePairs.find("content-location"); 162 if (mimeParametersIterator != keyValuePairs.end()) 163 mimeHeader->m_contentLocation = mimeParametersIterator->value; 164 165 return mimeHeader.release(); 166 } 167 168 MIMEHeader::Encoding MIMEHeader::parseContentTransferEncoding(const String& text) 169 { 170 String encoding = text.stripWhiteSpace().lower(); 171 if (encoding == "base64") 172 return Base64; 173 if (encoding == "quoted-printable") 174 return QuotedPrintable; 175 if (encoding == "8bit") 176 return EightBit; 177 if (encoding == "7bit") 178 return SevenBit; 179 if (encoding == "binary") 180 return Binary; 181 WTF_LOG_ERROR("Unknown encoding '%s' found in MIME header.", text.ascii().data()); 182 return Unknown; 183 } 184 185 MIMEHeader::MIMEHeader() 186 : m_contentTransferEncoding(Unknown) 187 { 188 } 189 190 static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary) 191 { 192 String line; 193 while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 194 if (line == boundary) 195 return true; 196 } 197 return false; 198 } 199 200 MHTMLParser::MHTMLParser(SharedBuffer* data) 201 : m_lineReader(data, "\r\n") 202 { 203 } 204 205 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchive() 206 { 207 RefPtrWillBeRawPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader); 208 return parseArchiveWithHeader(header.get()); 209 } 210 211 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header) 212 { 213 if (!header) { 214 WTF_LOG_ERROR("Failed to parse MHTML part: no header."); 215 return nullptr; 216 } 217 218 RefPtrWillBeRawPtr<MHTMLArchive> archive = MHTMLArchive::create(); 219 if (!header->isMultipart()) { 220 // With IE a page with no resource is not multi-part. 221 bool endOfArchiveReached = false; 222 RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached); 223 if (!resource) 224 return nullptr; 225 archive->setMainResource(resource); 226 return archive; 227 } 228 229 // Skip the message content (it's a generic browser specific message). 230 skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); 231 232 bool endOfArchive = false; 233 while (!endOfArchive) { 234 RefPtrWillBeRawPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader); 235 if (!resourceHeader) { 236 WTF_LOG_ERROR("Failed to parse MHTML, invalid MIME header."); 237 return nullptr; 238 } 239 if (resourceHeader->contentType() == "multipart/alternative") { 240 // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames). 241 RefPtrWillBeRawPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get()); 242 if (!subframeArchive) { 243 WTF_LOG_ERROR("Failed to parse MHTML subframe."); 244 return nullptr; 245 } 246 bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); 247 ASSERT_UNUSED(endOfPartReached, endOfPartReached); 248 // The top-frame is the first frame found, regardless of the nesting level. 249 if (subframeArchive->mainResource()) 250 addResourceToArchive(subframeArchive->mainResource(), archive.get()); 251 archive->addSubframeArchive(subframeArchive); 252 continue; 253 } 254 255 RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive); 256 if (!resource) { 257 WTF_LOG_ERROR("Failed to parse MHTML part."); 258 return nullptr; 259 } 260 addResourceToArchive(resource.get(), archive.get()); 261 } 262 263 return archive.release(); 264 } 265 266 void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive) 267 { 268 const AtomicString& mimeType = resource->mimeType(); 269 if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") { 270 m_resources.append(resource); 271 return; 272 } 273 274 // The first document suitable resource is the main frame. 275 if (!archive->mainResource()) { 276 archive->setMainResource(resource); 277 m_frames.append(archive); 278 return; 279 } 280 281 RefPtrWillBeRawPtr<MHTMLArchive> subframe = MHTMLArchive::create(); 282 subframe->setMainResource(resource); 283 m_frames.append(subframe); 284 } 285 286 PassRefPtrWillBeRawPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached) 287 { 288 ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty()); 289 290 // If no content transfer encoding is specified, default to binary encoding. 291 MIMEHeader::Encoding contentTransferEncoding = mimeHeader.contentTransferEncoding(); 292 if (contentTransferEncoding == MIMEHeader::Unknown) 293 contentTransferEncoding = MIMEHeader::Binary; 294 295 RefPtr<SharedBuffer> content = SharedBuffer::create(); 296 const bool checkBoundary = !endOfPartBoundary.isEmpty(); 297 bool endOfPartReached = false; 298 if (contentTransferEncoding == MIMEHeader::Binary) { 299 if (!checkBoundary) { 300 WTF_LOG_ERROR("Binary contents requires end of part"); 301 return nullptr; 302 } 303 m_lineReader.setSeparator(endOfPartBoundary.utf8().data()); 304 Vector<char> part; 305 if (!m_lineReader.nextChunk(part)) { 306 WTF_LOG_ERROR("Binary contents requires end of part"); 307 return nullptr; 308 } 309 content->append(part); 310 m_lineReader.setSeparator("\r\n"); 311 Vector<char> nextChars; 312 if (m_lineReader.peek(nextChars, 2) != 2) { 313 WTF_LOG_ERROR("Invalid seperator."); 314 return nullptr; 315 } 316 endOfPartReached = true; 317 ASSERT(nextChars.size() == 2); 318 endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-'); 319 if (!endOfArchiveReached) { 320 String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback(); 321 if (!line.isEmpty()) { 322 WTF_LOG_ERROR("No CRLF at end of binary section."); 323 return nullptr; 324 } 325 } 326 } else { 327 String line; 328 while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 329 endOfArchiveReached = (line == endOfDocumentBoundary); 330 if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) { 331 endOfPartReached = true; 332 break; 333 } 334 // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'. 335 content->append(line.utf8().data(), line.length()); 336 if (contentTransferEncoding == MIMEHeader::QuotedPrintable) { 337 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines. 338 content->append("\r\n", 2); 339 } 340 } 341 } 342 if (!endOfPartReached && checkBoundary) { 343 WTF_LOG_ERROR("No bounday found for MHTML part."); 344 return nullptr; 345 } 346 347 Vector<char> data; 348 switch (contentTransferEncoding) { 349 case MIMEHeader::Base64: 350 if (!base64Decode(content->data(), content->size(), data)) { 351 WTF_LOG_ERROR("Invalid base64 content for MHTML part."); 352 return nullptr; 353 } 354 break; 355 case MIMEHeader::QuotedPrintable: 356 quotedPrintableDecode(content->data(), content->size(), data); 357 break; 358 case MIMEHeader::EightBit: 359 case MIMEHeader::SevenBit: 360 case MIMEHeader::Binary: 361 data.append(content->data(), content->size()); 362 break; 363 default: 364 WTF_LOG_ERROR("Invalid encoding for MHTML part."); 365 return nullptr; 366 } 367 RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data); 368 // FIXME: the URL in the MIME header could be relative, we should resolve it if it is. 369 // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5 370 // IE and Firefox (UNMht) seem to generate only absolute URLs. 371 KURL location = KURL(KURL(), mimeHeader.contentLocation()); 372 return ArchiveResource::create(contentBuffer, location, AtomicString(mimeHeader.contentType()), AtomicString(mimeHeader.charset()), String()); 373 } 374 375 size_t MHTMLParser::frameCount() const 376 { 377 return m_frames.size(); 378 } 379 380 MHTMLArchive* MHTMLParser::frameAt(size_t index) const 381 { 382 return m_frames[index].get(); 383 } 384 385 size_t MHTMLParser::subResourceCount() const 386 { 387 return m_resources.size(); 388 } 389 390 ArchiveResource* MHTMLParser::subResourceAt(size_t index) const 391 { 392 return m_resources[index].get(); 393 } 394 395 } 396