1 /* 2 * Copyright (C) 2009 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 // How we handle the base tag better. 32 // Current status: 33 // At now the normal way we use to handling base tag is 34 // a) For those links which have corresponding local saved files, such as 35 // savable CSS, JavaScript files, they will be written to relative URLs which 36 // point to local saved file. 
// These links are not resolved to absolute file URLs because, if they were,
// moving the saved files from one directory to another would turn the file
// URLs into dead links.
// b) Links that have no corresponding locally saved files, such as links in
// A and AREA tags, are resolved to absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
// Firefox handles the base tag in the same way.
//
// Problem:
// This approach cannot handle the following situation:
// the base tag is written by JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href="http://www.yahoo.com/"...');" to set up the base
// URL of the page while it is loading. Suppose we save "www.yahoo.com" as
// complete HTML to "c:\yahoo.htm". When we later load the saved complete-HTML
// page, the JavaScript inserts a base tag
// <base href="http://www.yahoo.com/"...> into the DOM, so all URLs which point
// to locally saved resource files are resolved as
// "http://www.yahoo.com/yahoo_files/...", which prevents every saved resource
// file from being loaded correctly. The page is also rendered badly, since
// none of the saved sub-resource files (such as CSS and JavaScript files) or
// sub-frame files can be fetched.
// Firefox, IE and WebKit-based browsers currently all have this problem.
//
// Solution:
// The solution is to comment out the old base tag and write a new base tag,
// <base href="." ...>, after the commented-out base tag. WebKit always uses
// the latest "href" attribute of a base tag to set the document's base URL.
// Based on this behavior, when we encounter a base tag, we comment it out and
// write a new base tag <base href="."> after the commented-out base tag.
67 // The new added base tag can help engine to locate correct base URL for 68 // correctly loading local saved resource files. Also I think we need to inherit 69 // the base target value from document object when appending new base tag. 70 // If there are multiple base tags in original document, we will comment all old 71 // base tags and append new base tag after each old base tag because we do not 72 // know those old base tags are original content or added by JavaScript. If 73 // they are added by JavaScript, it means when loading saved page, the script(s) 74 // will still insert base tag(s) to DOM, so the new added base tag(s) can 75 // override the incorrect base URL and make sure we alway load correct local 76 // saved resource files. 77 78 #include "config.h" 79 #include "WebPageSerializerImpl.h" 80 81 #include "Document.h" 82 #include "DocumentType.h" 83 #include "Element.h" 84 #include "FrameLoader.h" 85 #include "HTMLAllCollection.h" 86 #include "HTMLElement.h" 87 #include "HTMLFormElement.h" 88 #include "HTMLMetaElement.h" 89 #include "HTMLNames.h" 90 #include "KURL.h" 91 #include "PlatformString.h" 92 #include "StringBuilder.h" 93 #include "TextEncoding.h" 94 #include "markup.h" 95 96 #include "DOMUtilitiesPrivate.h" 97 #include "WebFrameImpl.h" 98 #include "WebURL.h" 99 #include "WebVector.h" 100 101 using namespace WebCore; 102 103 namespace WebKit { 104 105 // Maximum length of data buffer which is used to temporary save generated 106 // html content data. This is a soft limit which might be passed if a very large 107 // contegious string is found in the page. 
108 static const unsigned dataBufferCapacity = 65536; 109 110 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& currentFrameURL, 111 const TextEncoding& textEncoding, 112 Document* doc, 113 const String& directoryName) 114 : currentFrameURL(currentFrameURL) 115 , textEncoding(textEncoding) 116 , doc(doc) 117 , directoryName(directoryName) 118 , hasDoctype(false) 119 , hasCheckedMeta(false) 120 , skipMetaElement(0) 121 , isInScriptOrStyleTag(false) 122 , hasDocDeclaration(false) 123 { 124 // Cache the value since we check it lots of times. 125 isHTMLDocument = doc->isHTMLDocument(); 126 } 127 128 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( 129 const Element* element, SerializeDomParam* param, bool* needSkip) 130 { 131 StringBuilder result; 132 133 *needSkip = false; 134 if (param->isHTMLDocument) { 135 // Skip the open tag of original META tag which declare charset since we 136 // have overrided the META which have correct charset declaration after 137 // serializing open tag of HEAD element. 138 if (element->hasTagName(HTMLNames::metaTag)) { 139 const HTMLMetaElement* meta = static_cast<const HTMLMetaElement*>(element); 140 // Check whether the META tag has declared charset or not. 141 String equiv = meta->httpEquiv(); 142 if (equalIgnoringCase(equiv, "content-type")) { 143 String content = meta->content(); 144 if (content.length() && content.contains("charset", false)) { 145 // Find META tag declared charset, we need to skip it when 146 // serializing DOM. 147 param->skipMetaElement = element; 148 *needSkip = true; 149 } 150 } 151 } else if (element->hasTagName(HTMLNames::htmlTag)) { 152 // Check something before processing the open tag of HEAD element. 153 // First we add doc type declaration if original doc has it. 154 if (!param->hasDoctype) { 155 param->hasDoctype = true; 156 result.append(createMarkup(param->doc->doctype())); 157 } 158 159 // Add MOTW declaration before html tag. 
160 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. 161 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->currentFrameURL)); 162 } else if (element->hasTagName(HTMLNames::baseTag)) { 163 // Comment the BASE tag when serializing dom. 164 result.append("<!--"); 165 } 166 } else { 167 // Write XML declaration. 168 if (!param->hasDocDeclaration) { 169 param->hasDocDeclaration = true; 170 // Get encoding info. 171 String xmlEncoding = param->doc->xmlEncoding(); 172 if (xmlEncoding.isEmpty()) 173 xmlEncoding = param->doc->frame()->loader()->encoding(); 174 if (xmlEncoding.isEmpty()) 175 xmlEncoding = UTF8Encoding().name(); 176 result.append("<?xml version=\""); 177 result.append(param->doc->xmlVersion()); 178 result.append("\" encoding=\""); 179 result.append(xmlEncoding); 180 if (param->doc->xmlStandalone()) 181 result.append("\" standalone=\"yes"); 182 result.append("\"?>\n"); 183 } 184 // Add doc type declaration if original doc has it. 185 if (!param->hasDoctype) { 186 param->hasDoctype = true; 187 result.append(createMarkup(param->doc->doctype())); 188 } 189 } 190 return result.toString(); 191 } 192 193 String WebPageSerializerImpl::postActionAfterSerializeOpenTag( 194 const Element* element, SerializeDomParam* param) 195 { 196 StringBuilder result; 197 198 param->hasAddedContentsBeforeEnd = false; 199 if (!param->isHTMLDocument) 200 return result.toString(); 201 // Check after processing the open tag of HEAD element 202 if (!param->hasCheckedMeta 203 && element->hasTagName(HTMLNames::headTag)) { 204 param->hasCheckedMeta = true; 205 // Check meta element. WebKit only pre-parse the first 512 bytes 206 // of the document. If the whole <HEAD> is larger and meta is the 207 // end of head part, then this kind of pages aren't decoded correctly 208 // because of this issue. So when we serialize the DOM, we need to 209 // make sure the meta will in first child of head tag. 210 // See http://bugs.webkit.org/show_bug.cgi?id=16621. 
211 // First we generate new content for writing correct META element. 212 result.append(WebPageSerializer::generateMetaCharsetDeclaration( 213 String(param->textEncoding.name()))); 214 215 param->hasAddedContentsBeforeEnd = true; 216 // Will search each META which has charset declaration, and skip them all 217 // in PreActionBeforeSerializeOpenTag. 218 } else if (element->hasTagName(HTMLNames::scriptTag) 219 || element->hasTagName(HTMLNames::styleTag)) { 220 param->isInScriptOrStyleTag = true; 221 } 222 223 return result.toString(); 224 } 225 226 String WebPageSerializerImpl::preActionBeforeSerializeEndTag( 227 const Element* element, SerializeDomParam* param, bool* needSkip) 228 { 229 String result; 230 231 *needSkip = false; 232 if (!param->isHTMLDocument) 233 return result; 234 // Skip the end tag of original META tag which declare charset. 235 // Need not to check whether it's META tag since we guarantee 236 // skipMetaElement is definitely META tag if it's not 0. 237 if (param->skipMetaElement == element) 238 *needSkip = true; 239 else if (element->hasTagName(HTMLNames::scriptTag) 240 || element->hasTagName(HTMLNames::styleTag)) { 241 ASSERT(param->isInScriptOrStyleTag); 242 param->isInScriptOrStyleTag = false; 243 } 244 245 return result; 246 } 247 248 // After we finish serializing end tag of a element, we give the target 249 // element a chance to do some post work to add some additional data. 250 String WebPageSerializerImpl::postActionAfterSerializeEndTag( 251 const Element* element, SerializeDomParam* param) 252 { 253 StringBuilder result; 254 255 if (!param->isHTMLDocument) 256 return result.toString(); 257 // Comment the BASE tag when serializing DOM. 258 if (element->hasTagName(HTMLNames::baseTag)) { 259 result.append("-->"); 260 // Append a new base tag declaration. 
261 result.append(WebPageSerializer::generateBaseTagDeclaration( 262 param->doc->baseTarget())); 263 } 264 265 return result.toString(); 266 } 267 268 void WebPageSerializerImpl::saveHTMLContentToBuffer( 269 const String& result, SerializeDomParam* param) 270 { 271 m_dataBuffer.append(result); 272 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished, 273 param, 274 0); 275 } 276 277 void WebPageSerializerImpl::encodeAndFlushBuffer( 278 WebPageSerializerClient::PageSerializationStatus status, 279 SerializeDomParam* param, 280 bool force) 281 { 282 // Data buffer is not full nor do we want to force flush. 283 if (!force && m_dataBuffer.length() <= dataBufferCapacity) 284 return; 285 286 String content = m_dataBuffer.toString(); 287 m_dataBuffer.clear(); 288 289 // Convert the unicode content to target encoding 290 CString encodedContent = param->textEncoding.encode( 291 content.characters(), content.length(), EntitiesForUnencodables); 292 293 // Send result to the client. 294 m_client->didSerializeDataForFrame(param->currentFrameURL, 295 WebCString(encodedContent.data(), encodedContent.length()), 296 status); 297 } 298 299 void WebPageSerializerImpl::openTagToString(const Element* element, 300 SerializeDomParam* param) 301 { 302 // FIXME: use StringBuilder instead of String. 303 bool needSkip; 304 // Do pre action for open tag. 305 String result = preActionBeforeSerializeOpenTag(element, param, &needSkip); 306 if (needSkip) 307 return; 308 // Add open tag 309 result += "<" + element->nodeName(); 310 // Go through all attributes and serialize them. 
311 const NamedNodeMap *attrMap = element->attributes(true); 312 if (attrMap) { 313 unsigned numAttrs = attrMap->length(); 314 for (unsigned i = 0; i < numAttrs; i++) { 315 result += " "; 316 // Add attribute pair 317 const Attribute *attribute = attrMap->attributeItem(i); 318 result += attribute->name().toString(); 319 result += "=\""; 320 if (!attribute->value().isEmpty()) { 321 const String& attrValue = attribute->value(); 322 323 // Check whether we need to replace some resource links 324 // with local resource paths. 325 const QualifiedName& attrName = attribute->name(); 326 if (elementHasLegalLinkAttribute(element, attrName)) { 327 // For links start with "javascript:", we do not change it. 328 if (attrValue.startsWith("javascript:", false)) 329 result += attrValue; 330 else { 331 // Get the absolute link 332 String completeURL = param->doc->completeURL(attrValue); 333 // Check whether we have local files for those link. 334 if (m_localLinks.contains(completeURL)) { 335 if (!m_localDirectoryName.isEmpty()) 336 result += "./" + m_localDirectoryName + "/"; 337 result += m_localLinks.get(completeURL); 338 } else 339 result += completeURL; 340 } 341 } else { 342 if (param->isHTMLDocument) 343 result += m_htmlEntities.convertEntitiesInString(attrValue); 344 else 345 result += m_xmlEntities.convertEntitiesInString(attrValue); 346 } 347 } 348 result += "\""; 349 } 350 } 351 352 // Do post action for open tag. 353 String addedContents = postActionAfterSerializeOpenTag(element, param); 354 // Complete the open tag for element when it has child/children. 355 if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd) 356 result += ">"; 357 // Append the added contents generate in post action of open tag. 358 result += addedContents; 359 // Save the result to data buffer. 360 saveHTMLContentToBuffer(result, param); 361 } 362 363 // Serialize end tag of an specified element. 
364 void WebPageSerializerImpl::endTagToString(const Element* element, 365 SerializeDomParam* param) 366 { 367 bool needSkip; 368 // Do pre action for end tag. 369 String result = preActionBeforeSerializeEndTag(element, 370 param, 371 &needSkip); 372 if (needSkip) 373 return; 374 // Write end tag when element has child/children. 375 if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd) { 376 result += "</"; 377 result += element->nodeName(); 378 result += ">"; 379 } else { 380 // Check whether we have to write end tag for empty element. 381 if (param->isHTMLDocument) { 382 result += ">"; 383 const HTMLElement* htmlElement = 384 static_cast<const HTMLElement*>(element); 385 if (htmlElement->endTagRequirement() == TagStatusRequired) { 386 // We need to write end tag when it is required. 387 result += "</"; 388 result += element->nodeName(); 389 result += ">"; 390 } 391 } else { 392 // For xml base document. 393 result += " />"; 394 } 395 } 396 // Do post action for end tag. 397 result += postActionAfterSerializeEndTag(element, param); 398 // Save the result to data buffer. 399 saveHTMLContentToBuffer(result, param); 400 } 401 402 void WebPageSerializerImpl::buildContentForNode(const Node* node, 403 SerializeDomParam* param) 404 { 405 switch (node->nodeType()) { 406 case Node::ELEMENT_NODE: 407 // Process open tag of element. 408 openTagToString(static_cast<const Element*>(node), param); 409 // Walk through the children nodes and process it. 410 for (const Node *child = node->firstChild(); child; child = child->nextSibling()) 411 buildContentForNode(child, param); 412 // Process end tag of element. 413 endTagToString(static_cast<const Element*>(node), param); 414 break; 415 case Node::TEXT_NODE: 416 saveHTMLContentToBuffer(createMarkup(node), param); 417 break; 418 case Node::ATTRIBUTE_NODE: 419 case Node::DOCUMENT_NODE: 420 case Node::DOCUMENT_FRAGMENT_NODE: 421 // Should not exist. 
422 ASSERT_NOT_REACHED(); 423 break; 424 // Document type node can be in DOM? 425 case Node::DOCUMENT_TYPE_NODE: 426 param->hasDoctype = true; 427 default: 428 // For other type node, call default action. 429 saveHTMLContentToBuffer(createMarkup(node), param); 430 break; 431 } 432 } 433 434 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame, 435 bool recursiveSerialization, 436 WebPageSerializerClient* client, 437 const WebVector<WebURL>& links, 438 const WebVector<WebString>& localPaths, 439 const WebString& localDirectoryName) 440 : m_client(client) 441 , m_recursiveSerialization(recursiveSerialization) 442 , m_framesCollected(false) 443 , m_localDirectoryName(localDirectoryName) 444 , m_htmlEntities(false) 445 , m_xmlEntities(true) 446 { 447 // Must specify available webframe. 448 ASSERT(frame); 449 m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame); 450 // Make sure we have non 0 client. 451 ASSERT(client); 452 // Build local resources map. 453 ASSERT(links.size() == localPaths.size()); 454 for (size_t i = 0; i < links.size(); i++) { 455 KURL url = links[i]; 456 ASSERT(!m_localLinks.contains(url.string())); 457 m_localLinks.set(url.string(), localPaths[i]); 458 } 459 460 ASSERT(!m_dataBuffer.length()); 461 } 462 463 void WebPageSerializerImpl::collectTargetFrames() 464 { 465 ASSERT(!m_framesCollected); 466 m_framesCollected = true; 467 468 // First, process main frame. 469 m_frames.append(m_specifiedWebFrameImpl); 470 // Return now if user only needs to serialize specified frame, not including 471 // all sub-frames. 472 if (!m_recursiveSerialization) 473 return; 474 // Collect all frames inside the specified frame. 475 for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) { 476 WebFrameImpl* currentFrame = m_frames[i]; 477 // Get current using document. 478 Document* currentDoc = currentFrame->frame()->document(); 479 // Go through sub-frames. 
480 RefPtr<HTMLAllCollection> all = currentDoc->all(); 481 for (Node* node = all->firstItem(); node; node = all->nextItem()) { 482 if (!node->isHTMLElement()) 483 continue; 484 Element* element = static_cast<Element*>(node); 485 WebFrameImpl* webFrame = 486 WebFrameImpl::fromFrameOwnerElement(element); 487 if (webFrame) 488 m_frames.append(webFrame); 489 } 490 } 491 } 492 493 bool WebPageSerializerImpl::serialize() 494 { 495 // Collect target frames. 496 if (!m_framesCollected) 497 collectTargetFrames(); 498 bool didSerialization = false; 499 // Get KURL for main frame. 500 KURL mainPageURL = m_specifiedWebFrameImpl->frame()->loader()->url(); 501 502 // Go through all frames for serializing DOM for whole page, include 503 // sub-frames. 504 for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) { 505 // Get current serializing frame. 506 WebFrameImpl* currentFrame = m_frames[i]; 507 // Get current using document. 508 Document* currentDoc = currentFrame->frame()->document(); 509 // Get current frame's URL. 510 const KURL& currentFrameURL = currentFrame->frame()->loader()->url(); 511 512 // Check whether we have done this document. 513 if (m_localLinks.contains(currentFrameURL.string())) { 514 // A new document, we will serialize it. 515 didSerialization = true; 516 // Get target encoding for current document. 517 String encoding = currentFrame->frame()->loader()->encoding(); 518 // Create the text encoding object with target encoding. 519 TextEncoding textEncoding(encoding); 520 // Construct serialize parameter for late processing document. 521 SerializeDomParam param(currentFrameURL, 522 encoding.length() ? textEncoding : UTF8Encoding(), 523 currentDoc, 524 currentFrameURL == mainPageURL ? m_localDirectoryName : ""); 525 526 // Process current document. 527 Element* rootElement = currentDoc->documentElement(); 528 if (rootElement) 529 buildContentForNode(rootElement, ¶m); 530 531 // Flush the remainder data and finish serializing current frame. 
532 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, 533 ¶m, 534 1); 535 } 536 } 537 538 // We have done call frames, so we send message to embedder to tell it that 539 // frames are finished serializing. 540 ASSERT(!m_dataBuffer.length()); 541 m_client->didSerializeDataForFrame(KURL(), 542 WebCString("", 0), 543 WebPageSerializerClient::AllFramesAreFinished); 544 return didSerialization; 545 } 546 547 } // namespace WebKit 548