1 /* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "platform/mhtml/MHTMLParser.h" 33 34 #include "platform/MIMETypeRegistry.h" 35 #include "platform/mhtml/MHTMLArchive.h" 36 #include "platform/network/ParsedContentType.h" 37 #include "platform/text/QuotedPrintable.h" 38 #include "wtf/HashMap.h" 39 #include "wtf/RefCounted.h" 40 #include "wtf/text/Base64.h" 41 #include "wtf/text/StringBuilder.h" 42 #include "wtf/text/StringConcatenate.h" 43 #include "wtf/text/StringHash.h" 44 #include "wtf/text/WTFString.h" 45 46 namespace WebCore { 47 48 // This class is a limited MIME parser used to parse the MIME headers of MHTML files. 49 class MIMEHeader : public RefCounted<MIMEHeader> { 50 public: 51 enum Encoding { 52 QuotedPrintable, 53 Base64, 54 EightBit, 55 SevenBit, 56 Binary, 57 Unknown 58 }; 59 60 static PassRefPtr<MIMEHeader> parseHeader(SharedBufferChunkReader* crLFLineReader); 61 62 bool isMultipart() const { return m_contentType.startsWith("multipart/"); } 63 64 String contentType() const { return m_contentType; } 65 String charset() const { return m_charset; } 66 Encoding contentTransferEncoding() const { return m_contentTransferEncoding; } 67 String contentLocation() const { return m_contentLocation; } 68 69 // Multi-part type and boundaries are only valid for multipart MIME headers. 70 String multiPartType() const { return m_multipartType; } 71 String endOfPartBoundary() const { return m_endOfPartBoundary; } 72 String endOfDocumentBoundary() const { return m_endOfDocumentBoundary; } 73 74 private: 75 MIMEHeader(); 76 77 static Encoding parseContentTransferEncoding(const String&); 78 79 String m_contentType; 80 String m_charset; 81 Encoding m_contentTransferEncoding; 82 String m_contentLocation; 83 String m_multipartType; 84 String m_endOfPartBoundary; 85 String m_endOfDocumentBoundary; 86 }; 87 88 typedef HashMap<String, String> KeyValueMap; 89 90 static KeyValueMap retrieveKeyValuePairs(WebCore::SharedBufferChunkReader* buffer) 91 { 92 KeyValueMap keyValuePairs; 93 String line; 94 String key; 95 StringBuilder value; 96 while (!(line = buffer->nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 97 if (line.isEmpty()) 98 break; // Empty line means end of key/value section. 99 if (line[0] == '\t') { 100 ASSERT(!key.isEmpty()); 101 value.append(line.substring(1)); 102 continue; 103 } 104 // New key/value, store the previous one if any. 105 if (!key.isEmpty()) { 106 if (keyValuePairs.find(key) != keyValuePairs.end()) 107 WTF_LOG_ERROR("Key duplicate found in MIME header. Key is '%s', previous value replaced.", key.ascii().data()); 108 keyValuePairs.add(key, value.toString().stripWhiteSpace()); 109 key = String(); 110 value.clear(); 111 } 112 size_t semiColonIndex = line.find(':'); 113 if (semiColonIndex == kNotFound) { 114 // This is not a key value pair, ignore. 115 continue; 116 } 117 key = line.substring(0, semiColonIndex).lower().stripWhiteSpace(); 118 value.append(line.substring(semiColonIndex + 1)); 119 } 120 // Store the last property if there is one. 121 if (!key.isEmpty()) 122 keyValuePairs.set(key, value.toString().stripWhiteSpace()); 123 return keyValuePairs; 124 } 125 126 PassRefPtr<MIMEHeader> MIMEHeader::parseHeader(SharedBufferChunkReader* buffer) 127 { 128 RefPtr<MIMEHeader> mimeHeader = adoptRef(new MIMEHeader); 129 KeyValueMap keyValuePairs = retrieveKeyValuePairs(buffer); 130 KeyValueMap::iterator mimeParametersIterator = keyValuePairs.find("content-type"); 131 if (mimeParametersIterator != keyValuePairs.end()) { 132 ParsedContentType parsedContentType(mimeParametersIterator->value); 133 mimeHeader->m_contentType = parsedContentType.mimeType(); 134 if (!mimeHeader->isMultipart()) { 135 mimeHeader->m_charset = parsedContentType.charset().stripWhiteSpace(); 136 } else { 137 mimeHeader->m_multipartType = parsedContentType.parameterValueForName("type"); 138 mimeHeader->m_endOfPartBoundary = parsedContentType.parameterValueForName("boundary"); 139 if (mimeHeader->m_endOfPartBoundary.isNull()) { 140 WTF_LOG_ERROR("No boundary found in multipart MIME header."); 141 return 0; 142 } 143 mimeHeader->m_endOfPartBoundary.insert("--", 0); 144 mimeHeader->m_endOfDocumentBoundary = mimeHeader->m_endOfPartBoundary; 145 mimeHeader->m_endOfDocumentBoundary.append("--"); 146 } 147 } 148 149 mimeParametersIterator = keyValuePairs.find("content-transfer-encoding"); 150 if (mimeParametersIterator != keyValuePairs.end()) 151 mimeHeader->m_contentTransferEncoding = parseContentTransferEncoding(mimeParametersIterator->value); 152 153 mimeParametersIterator = keyValuePairs.find("content-location"); 154 if (mimeParametersIterator != keyValuePairs.end()) 155 mimeHeader->m_contentLocation = mimeParametersIterator->value; 156 157 return mimeHeader.release(); 158 } 159 160 MIMEHeader::Encoding MIMEHeader::parseContentTransferEncoding(const String& text) 161 { 162 String encoding = text.stripWhiteSpace().lower(); 163 if (encoding == "base64") 164 return Base64; 165 if (encoding == "quoted-printable") 166 return QuotedPrintable; 167 if (encoding == "8bit") 168 return EightBit; 169 if (encoding == "7bit") 170 return SevenBit; 171 if (encoding == "binary") 172 return Binary; 173 WTF_LOG_ERROR("Unknown encoding '%s' found in MIME header.", text.ascii().data()); 174 return Unknown; 175 } 176 177 MIMEHeader::MIMEHeader() 178 : m_contentTransferEncoding(Unknown) 179 { 180 } 181 182 static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary) 183 { 184 String line; 185 while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 186 if (line == boundary) 187 return true; 188 } 189 return false; 190 } 191 192 MHTMLParser::MHTMLParser(SharedBuffer* data) 193 : m_lineReader(data, "\r\n") 194 { 195 } 196 197 PassRefPtr<MHTMLArchive> MHTMLParser::parseArchive() 198 { 199 RefPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader); 200 return parseArchiveWithHeader(header.get()); 201 } 202 203 PassRefPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header) 204 { 205 if (!header) { 206 WTF_LOG_ERROR("Failed to parse MHTML part: no header."); 207 return 0; 208 } 209 210 RefPtr<MHTMLArchive> archive = MHTMLArchive::create(); 211 if (!header->isMultipart()) { 212 // With IE a page with no resource is not multi-part. 213 bool endOfArchiveReached = false; 214 RefPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached); 215 if (!resource) 216 return 0; 217 archive->setMainResource(resource); 218 return archive; 219 } 220 221 // Skip the message content (it's a generic browser specific message). 222 skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); 223 224 bool endOfArchive = false; 225 while (!endOfArchive) { 226 RefPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader); 227 if (!resourceHeader) { 228 WTF_LOG_ERROR("Failed to parse MHTML, invalid MIME header."); 229 return 0; 230 } 231 if (resourceHeader->contentType() == "multipart/alternative") { 232 // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames). 233 RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get()); 234 if (!subframeArchive) { 235 WTF_LOG_ERROR("Failed to parse MHTML subframe."); 236 return 0; 237 } 238 bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); 239 ASSERT_UNUSED(endOfPartReached, endOfPartReached); 240 // The top-frame is the first frame found, regardless of the nesting level. 241 if (subframeArchive->mainResource()) 242 addResourceToArchive(subframeArchive->mainResource(), archive.get()); 243 archive->addSubframeArchive(subframeArchive); 244 continue; 245 } 246 247 RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive); 248 if (!resource) { 249 WTF_LOG_ERROR("Failed to parse MHTML part."); 250 return 0; 251 } 252 addResourceToArchive(resource.get(), archive.get()); 253 } 254 255 return archive.release(); 256 } 257 258 void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive) 259 { 260 const AtomicString& mimeType = resource->mimeType(); 261 if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") { 262 m_resources.append(resource); 263 return; 264 } 265 266 // The first document suitable resource is the main frame. 267 if (!archive->mainResource()) { 268 archive->setMainResource(resource); 269 m_frames.append(archive); 270 return; 271 } 272 273 RefPtr<MHTMLArchive> subframe = MHTMLArchive::create(); 274 subframe->setMainResource(resource); 275 m_frames.append(subframe); 276 } 277 278 PassRefPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached) 279 { 280 ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty()); 281 282 // If no content transfer encoding is specified, default to binary encoding. 283 MIMEHeader::Encoding contentTransferEncoding = mimeHeader.contentTransferEncoding(); 284 if (contentTransferEncoding == MIMEHeader::Unknown) 285 contentTransferEncoding = MIMEHeader::Binary; 286 287 RefPtr<SharedBuffer> content = SharedBuffer::create(); 288 const bool checkBoundary = !endOfPartBoundary.isEmpty(); 289 bool endOfPartReached = false; 290 if (contentTransferEncoding == MIMEHeader::Binary) { 291 if (!checkBoundary) { 292 WTF_LOG_ERROR("Binary contents requires end of part"); 293 return 0; 294 } 295 m_lineReader.setSeparator(endOfPartBoundary.utf8().data()); 296 Vector<char> part; 297 if (!m_lineReader.nextChunk(part)) { 298 WTF_LOG_ERROR("Binary contents requires end of part"); 299 return 0; 300 } 301 content->append(part); 302 m_lineReader.setSeparator("\r\n"); 303 Vector<char> nextChars; 304 if (m_lineReader.peek(nextChars, 2) != 2) { 305 WTF_LOG_ERROR("Invalid seperator."); 306 return 0; 307 } 308 endOfPartReached = true; 309 ASSERT(nextChars.size() == 2); 310 endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-'); 311 if (!endOfArchiveReached) { 312 String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback(); 313 if (!line.isEmpty()) { 314 WTF_LOG_ERROR("No CRLF at end of binary section."); 315 return 0; 316 } 317 } 318 } else { 319 String line; 320 while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 321 endOfArchiveReached = (line == endOfDocumentBoundary); 322 if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) { 323 endOfPartReached = true; 324 break; 325 } 326 // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'. 327 content->append(line.utf8().data(), line.length()); 328 if (contentTransferEncoding == MIMEHeader::QuotedPrintable) { 329 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines. 330 content->append("\r\n", 2); 331 } 332 } 333 } 334 if (!endOfPartReached && checkBoundary) { 335 WTF_LOG_ERROR("No bounday found for MHTML part."); 336 return 0; 337 } 338 339 Vector<char> data; 340 switch (contentTransferEncoding) { 341 case MIMEHeader::Base64: 342 if (!base64Decode(content->data(), content->size(), data)) { 343 WTF_LOG_ERROR("Invalid base64 content for MHTML part."); 344 return 0; 345 } 346 break; 347 case MIMEHeader::QuotedPrintable: 348 quotedPrintableDecode(content->data(), content->size(), data); 349 break; 350 case MIMEHeader::EightBit: 351 case MIMEHeader::SevenBit: 352 case MIMEHeader::Binary: 353 data.append(content->data(), content->size()); 354 break; 355 default: 356 WTF_LOG_ERROR("Invalid encoding for MHTML part."); 357 return 0; 358 } 359 RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data); 360 // FIXME: the URL in the MIME header could be relative, we should resolve it if it is. 361 // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5 362 // IE and Firefox (UNMht) seem to generate only absolute URLs. 363 KURL location = KURL(KURL(), mimeHeader.contentLocation()); 364 return ArchiveResource::create(contentBuffer, location, AtomicString(mimeHeader.contentType()), AtomicString(mimeHeader.charset()), String()); 365 } 366 367 size_t MHTMLParser::frameCount() const 368 { 369 return m_frames.size(); 370 } 371 372 MHTMLArchive* MHTMLParser::frameAt(size_t index) const 373 { 374 return m_frames[index].get(); 375 } 376 377 size_t MHTMLParser::subResourceCount() const 378 { 379 return m_resources.size(); 380 } 381 382 ArchiveResource* MHTMLParser::subResourceAt(size_t index) const 383 { 384 return m_resources[index].get(); 385 } 386 387 } 388