1 /* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "core/loader/archive/MHTMLParser.h" 33 34 #include "core/loader/archive/MHTMLArchive.h" 35 #include "core/platform/MIMETypeRegistry.h" 36 #include "core/platform/network/MIMEHeader.h" 37 #include "core/platform/text/QuotedPrintable.h" 38 #include "wtf/text/Base64.h" 39 40 namespace WebCore { 41 42 static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary) 43 { 44 String line; 45 while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 46 if (line == boundary) 47 return true; 48 } 49 return false; 50 } 51 52 MHTMLParser::MHTMLParser(SharedBuffer* data) 53 : m_lineReader(data, "\r\n") 54 { 55 } 56 57 PassRefPtr<MHTMLArchive> MHTMLParser::parseArchive() 58 { 59 RefPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader); 60 return parseArchiveWithHeader(header.get()); 61 } 62 63 PassRefPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header) 64 { 65 if (!header) { 66 LOG_ERROR("Failed to parse MHTML part: no header."); 67 return 0; 68 } 69 70 RefPtr<MHTMLArchive> archive = MHTMLArchive::create(); 71 if (!header->isMultipart()) { 72 // With IE a page with no resource is not multi-part. 73 bool endOfArchiveReached = false; 74 RefPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached); 75 if (!resource) 76 return 0; 77 archive->setMainResource(resource); 78 return archive; 79 } 80 81 // Skip the message content (it's a generic browser specific message). 82 skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); 83 84 bool endOfArchive = false; 85 while (!endOfArchive) { 86 RefPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader); 87 if (!resourceHeader) { 88 LOG_ERROR("Failed to parse MHTML, invalid MIME header."); 89 return 0; 90 } 91 if (resourceHeader->contentType() == "multipart/alternative") { 92 // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames). 93 RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get()); 94 if (!subframeArchive) { 95 LOG_ERROR("Failed to parse MHTML subframe."); 96 return 0; 97 } 98 bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); 99 ASSERT_UNUSED(endOfPartReached, endOfPartReached); 100 // The top-frame is the first frame found, regardless of the nesting level. 101 if (subframeArchive->mainResource()) 102 addResourceToArchive(subframeArchive->mainResource(), archive.get()); 103 archive->addSubframeArchive(subframeArchive); 104 continue; 105 } 106 107 RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive); 108 if (!resource) { 109 LOG_ERROR("Failed to parse MHTML part."); 110 return 0; 111 } 112 addResourceToArchive(resource.get(), archive.get()); 113 } 114 115 return archive.release(); 116 } 117 118 void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive) 119 { 120 const String& mimeType = resource->mimeType(); 121 if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") { 122 m_resources.append(resource); 123 return; 124 } 125 126 // The first document suitable resource is the main frame. 127 if (!archive->mainResource()) { 128 archive->setMainResource(resource); 129 m_frames.append(archive); 130 return; 131 } 132 133 RefPtr<MHTMLArchive> subframe = MHTMLArchive::create(); 134 subframe->setMainResource(resource); 135 m_frames.append(subframe); 136 } 137 138 PassRefPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached) 139 { 140 ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty()); 141 142 RefPtr<SharedBuffer> content = SharedBuffer::create(); 143 const bool checkBoundary = !endOfPartBoundary.isEmpty(); 144 bool endOfPartReached = false; 145 if (mimeHeader.contentTransferEncoding() == MIMEHeader::Binary) { 146 if (!checkBoundary) { 147 LOG_ERROR("Binary contents requires end of part"); 148 return 0; 149 } 150 m_lineReader.setSeparator(endOfPartBoundary.utf8().data()); 151 Vector<char> part; 152 if (!m_lineReader.nextChunk(part)) { 153 LOG_ERROR("Binary contents requires end of part"); 154 return 0; 155 } 156 content->append(part); 157 m_lineReader.setSeparator("\r\n"); 158 Vector<char> nextChars; 159 if (m_lineReader.peek(nextChars, 2) != 2) { 160 LOG_ERROR("Invalid seperator."); 161 return 0; 162 } 163 endOfPartReached = true; 164 ASSERT(nextChars.size() == 2); 165 endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-'); 166 if (!endOfArchiveReached) { 167 String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback(); 168 if (!line.isEmpty()) { 169 LOG_ERROR("No CRLF at end of binary section."); 170 return 0; 171 } 172 } 173 } else { 174 String line; 175 while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 176 endOfArchiveReached = (line == endOfDocumentBoundary); 177 if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) { 178 endOfPartReached = true; 179 break; 180 } 181 // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'. 182 content->append(line.utf8().data(), line.length()); 183 if (mimeHeader.contentTransferEncoding() == MIMEHeader::QuotedPrintable) { 184 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines. 185 content->append("\r\n", 2); 186 } 187 } 188 } 189 if (!endOfPartReached && checkBoundary) { 190 LOG_ERROR("No bounday found for MHTML part."); 191 return 0; 192 } 193 194 Vector<char> data; 195 switch (mimeHeader.contentTransferEncoding()) { 196 case MIMEHeader::Base64: 197 if (!base64Decode(content->data(), content->size(), data)) { 198 LOG_ERROR("Invalid base64 content for MHTML part."); 199 return 0; 200 } 201 break; 202 case MIMEHeader::QuotedPrintable: 203 quotedPrintableDecode(content->data(), content->size(), data); 204 break; 205 case MIMEHeader::SevenBit: 206 case MIMEHeader::Binary: 207 data.append(content->data(), content->size()); 208 break; 209 default: 210 LOG_ERROR("Invalid encoding for MHTML part."); 211 return 0; 212 } 213 RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data); 214 // FIXME: the URL in the MIME header could be relative, we should resolve it if it is. 215 // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5 216 // IE and Firefox (UNMht) seem to generate only absolute URLs. 217 KURL location = KURL(KURL(), mimeHeader.contentLocation()); 218 return ArchiveResource::create(contentBuffer, location, mimeHeader.contentType(), mimeHeader.charset(), String()); 219 } 220 221 size_t MHTMLParser::frameCount() const 222 { 223 return m_frames.size(); 224 } 225 226 MHTMLArchive* MHTMLParser::frameAt(size_t index) const 227 { 228 return m_frames[index].get(); 229 } 230 231 size_t MHTMLParser::subResourceCount() const 232 { 233 return m_resources.size(); 234 } 235 236 ArchiveResource* MHTMLParser::subResourceAt(size_t index) const 237 { 238 return m_resources[index].get(); 239 } 240 241 } 242