Home | History | Annotate | Download | only in web
      1 /*
      2  * Copyright (C) 2009 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 // How we handle the base tag better.
     32 // Current status:
// Currently, the base tag is handled as follows:
// a) Links that have corresponding locally saved files, such as savable CSS
// and JavaScript files, are rewritten as relative URLs that point to the
// locally saved file. They are not resolved to absolute file URLs because
// such links would become dead after the saved files are moved from one
// directory to another.
// b) Links that have no corresponding locally saved files, such as links in
// A and AREA tags, are resolved to absolute URLs.
// c) All base tags are commented out when serializing the DOM for the page.
// Firefox handles the base tag the same way.
     44 //
     45 // Problem:
     46 // This way can not handle the following situation:
     47 // the base tag is written by JavaScript.
     48 // For example. The page "www.yahoo.com" use
     49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
     50 // of page when loading page. So when saving page as completed-HTML, we assume
     51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
     52 // completed-HTML page, then the JavaScript will insert a base tag
     53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
     54 // local saved resource files will be resolved as
     55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved  resource
     56 // files can not be loaded correctly. Also the page will be rendered ugly since
     57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
     58 // files can not be fetched.
     59 // Now FireFox, IE and WebKit based Browser all have this problem.
     60 //
     61 // Solution:
     62 // My solution is that we comment old base tag and write new base tag:
     63 // <base href="." ...> after the previous commented base tag. In WebKit, it
     64 // always uses the latest "href" attribute of base tag to set document's base
     65 // URL. Based on this behavior, when we encounter a base tag, we comment it and
     66 // write a new base tag <base href="."> after the previous commented base tag.
     67 // The new added base tag can help engine to locate correct base URL for
     68 // correctly loading local saved resource files. Also I think we need to inherit
     69 // the base target value from document object when appending new base tag.
// If there are multiple base tags in the original document, we comment out all
// of the old base tags and append a new base tag after each of them, because
// we do not know whether the old base tags are original content or were added
// by JavaScript. If they were added by JavaScript, the script(s) will insert
// the base tag(s) into the DOM again when the saved page is loaded, so the
// newly added base tag(s) can override the incorrect base URL and make sure
// we always load the correct locally saved resource files.
     77 
     78 #include "config.h"
     79 #include "WebPageSerializerImpl.h"
     80 
     81 #include "DOMUtilitiesPrivate.h"
     82 #include "HTMLNames.h"
     83 #include "WebFrameImpl.h"
     84 #include "core/dom/Document.h"
     85 #include "core/dom/DocumentType.h"
     86 #include "core/dom/Element.h"
     87 #include "core/editing/markup.h"
     88 #include "core/html/HTMLAllCollection.h"
     89 #include "core/html/HTMLElement.h"
     90 #include "core/html/HTMLFormElement.h"
     91 #include "core/html/HTMLHtmlElement.h"
     92 #include "core/html/HTMLMetaElement.h"
     93 #include "core/loader/DocumentLoader.h"
     94 #include "core/loader/FrameLoader.h"
     95 #include "public/platform/WebURL.h"
     96 #include "public/platform/WebVector.h"
     97 #include "weborigin/KURL.h"
     98 #include "wtf/text/TextEncoding.h"
     99 
    100 using namespace WebCore;
    101 
    102 namespace WebKit {
    103 
    104 // Maximum length of data buffer which is used to temporary save generated
    105 // html content data. This is a soft limit which might be passed if a very large
    106 // contegious string is found in the page.
    107 static const unsigned dataBufferCapacity = 65536;
    108 
    109 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
    110                                                             const WTF::TextEncoding& textEncoding,
    111                                                             Document* document,
    112                                                             const String& directoryName)
    113     : url(url)
    114     , textEncoding(textEncoding)
    115     , document(document)
    116     , directoryName(directoryName)
    117     , isHTMLDocument(document->isHTMLDocument())
    118     , haveSeenDocType(false)
    119     , haveAddedCharsetDeclaration(false)
    120     , skipMetaElement(0)
    121     , isInScriptOrStyleTag(false)
    122     , haveAddedXMLProcessingDirective(false)
    123     , haveAddedContentsBeforeEnd(false)
    124 {
    125 }
    126 
    127 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
    128     const Element* element, SerializeDomParam* param, bool* needSkip)
    129 {
    130     StringBuilder result;
    131 
    132     *needSkip = false;
    133     if (param->isHTMLDocument) {
    134         // Skip the open tag of original META tag which declare charset since we
    135         // have overrided the META which have correct charset declaration after
    136         // serializing open tag of HEAD element.
    137         if (element->hasTagName(HTMLNames::metaTag)) {
    138             const HTMLMetaElement* meta = static_cast<const HTMLMetaElement*>(element);
    139             // Check whether the META tag has declared charset or not.
    140             String equiv = meta->httpEquiv();
    141             if (equalIgnoringCase(equiv, "content-type")) {
    142                 String content = meta->content();
    143                 if (content.length() && content.contains("charset", false)) {
    144                     // Find META tag declared charset, we need to skip it when
    145                     // serializing DOM.
    146                     param->skipMetaElement = element;
    147                     *needSkip = true;
    148                 }
    149             }
    150         } else if (isHTMLHtmlElement(element)) {
    151             // Check something before processing the open tag of HEAD element.
    152             // First we add doc type declaration if original document has it.
    153             if (!param->haveSeenDocType) {
    154                 param->haveSeenDocType = true;
    155                 result.append(createMarkup(param->document->doctype()));
    156             }
    157 
    158             // Add MOTW declaration before html tag.
    159             // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
    160             result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
    161         } else if (element->hasTagName(HTMLNames::baseTag)) {
    162             // Comment the BASE tag when serializing dom.
    163             result.append("<!--");
    164         }
    165     } else {
    166         // Write XML declaration.
    167         if (!param->haveAddedXMLProcessingDirective) {
    168             param->haveAddedXMLProcessingDirective = true;
    169             // Get encoding info.
    170             String xmlEncoding = param->document->xmlEncoding();
    171             if (xmlEncoding.isEmpty())
    172                 xmlEncoding = param->document->encoding();
    173             if (xmlEncoding.isEmpty())
    174                 xmlEncoding = UTF8Encoding().name();
    175             result.append("<?xml version=\"");
    176             result.append(param->document->xmlVersion());
    177             result.append("\" encoding=\"");
    178             result.append(xmlEncoding);
    179             if (param->document->xmlStandalone())
    180                 result.append("\" standalone=\"yes");
    181             result.append("\"?>\n");
    182         }
    183         // Add doc type declaration if original document has it.
    184         if (!param->haveSeenDocType) {
    185             param->haveSeenDocType = true;
    186             result.append(createMarkup(param->document->doctype()));
    187         }
    188     }
    189     return result.toString();
    190 }
    191 
    192 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
    193     const Element* element, SerializeDomParam* param)
    194 {
    195     StringBuilder result;
    196 
    197     param->haveAddedContentsBeforeEnd = false;
    198     if (!param->isHTMLDocument)
    199         return result.toString();
    200     // Check after processing the open tag of HEAD element
    201     if (!param->haveAddedCharsetDeclaration
    202         && element->hasTagName(HTMLNames::headTag)) {
    203         param->haveAddedCharsetDeclaration = true;
    204         // Check meta element. WebKit only pre-parse the first 512 bytes
    205         // of the document. If the whole <HEAD> is larger and meta is the
    206         // end of head part, then this kind of pages aren't decoded correctly
    207         // because of this issue. So when we serialize the DOM, we need to
    208         // make sure the meta will in first child of head tag.
    209         // See http://bugs.webkit.org/show_bug.cgi?id=16621.
    210         // First we generate new content for writing correct META element.
    211         result.append(WebPageSerializer::generateMetaCharsetDeclaration(
    212             String(param->textEncoding.name())));
    213 
    214         param->haveAddedContentsBeforeEnd = true;
    215         // Will search each META which has charset declaration, and skip them all
    216         // in PreActionBeforeSerializeOpenTag.
    217     } else if (element->hasTagName(HTMLNames::scriptTag)
    218                || element->hasTagName(HTMLNames::styleTag)) {
    219         param->isInScriptOrStyleTag = true;
    220     }
    221 
    222     return result.toString();
    223 }
    224 
    225 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
    226     const Element* element, SerializeDomParam* param, bool* needSkip)
    227 {
    228     String result;
    229 
    230     *needSkip = false;
    231     if (!param->isHTMLDocument)
    232         return result;
    233     // Skip the end tag of original META tag which declare charset.
    234     // Need not to check whether it's META tag since we guarantee
    235     // skipMetaElement is definitely META tag if it's not 0.
    236     if (param->skipMetaElement == element)
    237         *needSkip = true;
    238     else if (element->hasTagName(HTMLNames::scriptTag)
    239              || element->hasTagName(HTMLNames::styleTag)) {
    240         ASSERT(param->isInScriptOrStyleTag);
    241         param->isInScriptOrStyleTag = false;
    242     }
    243 
    244     return result;
    245 }
    246 
    247 // After we finish serializing end tag of a element, we give the target
    248 // element a chance to do some post work to add some additional data.
    249 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
    250     const Element* element, SerializeDomParam* param)
    251 {
    252     StringBuilder result;
    253 
    254     if (!param->isHTMLDocument)
    255         return result.toString();
    256     // Comment the BASE tag when serializing DOM.
    257     if (element->hasTagName(HTMLNames::baseTag)) {
    258         result.append("-->");
    259         // Append a new base tag declaration.
    260         result.append(WebPageSerializer::generateBaseTagDeclaration(
    261             param->document->baseTarget()));
    262     }
    263 
    264     return result.toString();
    265 }
    266 
    267 void WebPageSerializerImpl::saveHTMLContentToBuffer(
    268     const String& result, SerializeDomParam* param)
    269 {
    270     m_dataBuffer.append(result);
    271     encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
    272                          param,
    273                          DoNotForceFlush);
    274 }
    275 
    276 void WebPageSerializerImpl::encodeAndFlushBuffer(
    277     WebPageSerializerClient::PageSerializationStatus status,
    278     SerializeDomParam* param,
    279     FlushOption flushOption)
    280 {
    281     // Data buffer is not full nor do we want to force flush.
    282     if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
    283         return;
    284 
    285     String content = m_dataBuffer.toString();
    286     m_dataBuffer.clear();
    287 
    288     CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables);
    289 
    290     // Send result to the client.
    291     m_client->didSerializeDataForFrame(param->url,
    292                                        WebCString(encodedContent.data(), encodedContent.length()),
    293                                        status);
    294 }
    295 
    296 void WebPageSerializerImpl::openTagToString(Element* element,
    297                                             SerializeDomParam* param)
    298 {
    299     bool needSkip;
    300     StringBuilder result;
    301     // Do pre action for open tag.
    302     result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
    303     if (needSkip)
    304         return;
    305     // Add open tag
    306     result.append('<');
    307     result.append(element->nodeName().lower());
    308     // Go through all attributes and serialize them.
    309     if (element->hasAttributes()) {
    310         unsigned numAttrs = element->attributeCount();
    311         for (unsigned i = 0; i < numAttrs; i++) {
    312             result.append(' ');
    313             // Add attribute pair
    314             const Attribute *attribute = element->attributeItem(i);
    315             result.append(attribute->name().toString());
    316             result.appendLiteral("=\"");
    317             if (!attribute->value().isEmpty()) {
    318                 const String& attrValue = attribute->value();
    319 
    320                 // Check whether we need to replace some resource links
    321                 // with local resource paths.
    322                 const QualifiedName& attrName = attribute->name();
    323                 if (elementHasLegalLinkAttribute(element, attrName)) {
    324                     // For links start with "javascript:", we do not change it.
    325                     if (attrValue.startsWith("javascript:", false))
    326                         result.append(attrValue);
    327                     else {
    328                         // Get the absolute link
    329                         WebFrameImpl* subFrame = WebFrameImpl::fromFrameOwnerElement(element);
    330                         String completeURL = subFrame ? subFrame->frame()->document()->url() :
    331                                                         param->document->completeURL(attrValue);
    332                         // Check whether we have local files for those link.
    333                         if (m_localLinks.contains(completeURL)) {
    334                             if (!param->directoryName.isEmpty()) {
    335                                 result.appendLiteral("./");
    336                                 result.append(param->directoryName);
    337                                 result.append('/');
    338                             }
    339                             result.append(m_localLinks.get(completeURL));
    340                         } else
    341                             result.append(completeURL);
    342                     }
    343                 } else {
    344                     if (param->isHTMLDocument)
    345                         result.append(m_htmlEntities.convertEntitiesInString(attrValue));
    346                     else
    347                         result.append(m_xmlEntities.convertEntitiesInString(attrValue));
    348                 }
    349             }
    350             result.append('\"');
    351         }
    352     }
    353 
    354     // Do post action for open tag.
    355     String addedContents = postActionAfterSerializeOpenTag(element, param);
    356     // Complete the open tag for element when it has child/children.
    357     if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd)
    358         result.append('>');
    359     // Append the added contents generate in  post action of open tag.
    360     result.append(addedContents);
    361     // Save the result to data buffer.
    362     saveHTMLContentToBuffer(result.toString(), param);
    363 }
    364 
    365 // Serialize end tag of an specified element.
    366 void WebPageSerializerImpl::endTagToString(Element* element,
    367                                            SerializeDomParam* param)
    368 {
    369     bool needSkip;
    370     StringBuilder result;
    371     // Do pre action for end tag.
    372     result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
    373     if (needSkip)
    374         return;
    375     // Write end tag when element has child/children.
    376     if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd) {
    377         result.appendLiteral("</");
    378         result.append(element->nodeName().lower());
    379         result.append('>');
    380     } else {
    381         // Check whether we have to write end tag for empty element.
    382         if (param->isHTMLDocument) {
    383             result.append('>');
    384             // FIXME: This code is horribly wrong.  WebPageSerializerImpl must die.
    385             if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) {
    386                 // We need to write end tag when it is required.
    387                 result.appendLiteral("</");
    388                 result.append(element->nodeName().lower());
    389                 result.append('>');
    390             }
    391         } else {
    392             // For xml base document.
    393             result.appendLiteral(" />");
    394         }
    395     }
    396     // Do post action for end tag.
    397     result.append(postActionAfterSerializeEndTag(element, param));
    398     // Save the result to data buffer.
    399     saveHTMLContentToBuffer(result.toString(), param);
    400 }
    401 
    402 void WebPageSerializerImpl::buildContentForNode(Node* node,
    403                                                 SerializeDomParam* param)
    404 {
    405     switch (node->nodeType()) {
    406     case Node::ELEMENT_NODE:
    407         // Process open tag of element.
    408         openTagToString(toElement(node), param);
    409         // Walk through the children nodes and process it.
    410         for (Node *child = node->firstChild(); child; child = child->nextSibling())
    411             buildContentForNode(child, param);
    412         // Process end tag of element.
    413         endTagToString(toElement(node), param);
    414         break;
    415     case Node::TEXT_NODE:
    416         saveHTMLContentToBuffer(createMarkup(node), param);
    417         break;
    418     case Node::ATTRIBUTE_NODE:
    419     case Node::DOCUMENT_NODE:
    420     case Node::DOCUMENT_FRAGMENT_NODE:
    421         // Should not exist.
    422         ASSERT_NOT_REACHED();
    423         break;
    424     // Document type node can be in DOM?
    425     case Node::DOCUMENT_TYPE_NODE:
    426         param->haveSeenDocType = true;
    427     default:
    428         // For other type node, call default action.
    429         saveHTMLContentToBuffer(createMarkup(node), param);
    430         break;
    431     }
    432 }
    433 
    434 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
    435                                              bool recursiveSerialization,
    436                                              WebPageSerializerClient* client,
    437                                              const WebVector<WebURL>& links,
    438                                              const WebVector<WebString>& localPaths,
    439                                              const WebString& localDirectoryName)
    440     : m_client(client)
    441     , m_recursiveSerialization(recursiveSerialization)
    442     , m_framesCollected(false)
    443     , m_localDirectoryName(localDirectoryName)
    444     , m_htmlEntities(false)
    445     , m_xmlEntities(true)
    446 {
    447     // Must specify available webframe.
    448     ASSERT(frame);
    449     m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame);
    450     // Make sure we have non 0 client.
    451     ASSERT(client);
    452     // Build local resources map.
    453     ASSERT(links.size() == localPaths.size());
    454     for (size_t i = 0; i < links.size(); i++) {
    455         KURL url = links[i];
    456         ASSERT(!m_localLinks.contains(url.string()));
    457         m_localLinks.set(url.string(), localPaths[i]);
    458     }
    459 
    460     ASSERT(m_dataBuffer.isEmpty());
    461 }
    462 
    463 void WebPageSerializerImpl::collectTargetFrames()
    464 {
    465     ASSERT(!m_framesCollected);
    466     m_framesCollected = true;
    467 
    468     // First, process main frame.
    469     m_frames.append(m_specifiedWebFrameImpl);
    470     // Return now if user only needs to serialize specified frame, not including
    471     // all sub-frames.
    472     if (!m_recursiveSerialization)
    473         return;
    474     // Collect all frames inside the specified frame.
    475     for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
    476         WebFrameImpl* currentFrame = m_frames[i];
    477         // Get current using document.
    478         Document* currentDoc = currentFrame->frame()->document();
    479         // Go through sub-frames.
    480         RefPtr<HTMLCollection> all = currentDoc->all();
    481 
    482         for (unsigned i = 0; Node* node = all->item(i); i++) {
    483             if (!node->isHTMLElement())
    484                 continue;
    485             Element* element = toElement(node);
    486             WebFrameImpl* webFrame =
    487                 WebFrameImpl::fromFrameOwnerElement(element);
    488             if (webFrame)
    489                 m_frames.append(webFrame);
    490         }
    491     }
    492 }
    493 
    494 bool WebPageSerializerImpl::serialize()
    495 {
    496     if (!m_framesCollected)
    497         collectTargetFrames();
    498 
    499     bool didSerialization = false;
    500     KURL mainURL = m_specifiedWebFrameImpl->frame()->document()->url();
    501 
    502     for (unsigned i = 0; i < m_frames.size(); ++i) {
    503         WebFrameImpl* webFrame = m_frames[i];
    504         Document* document = webFrame->frame()->document();
    505         const KURL& url = document->url();
    506 
    507         if (!url.isValid() || !m_localLinks.contains(url.string()))
    508             continue;
    509 
    510         didSerialization = true;
    511 
    512         String encoding = document->encoding();
    513         const WTF::TextEncoding& textEncoding = encoding.isEmpty() ? UTF8Encoding() : WTF::TextEncoding(encoding);
    514         String directoryName = url == mainURL ? m_localDirectoryName : "";
    515 
    516         SerializeDomParam param(url, textEncoding, document, directoryName);
    517 
    518         Element* documentElement = document->documentElement();
    519         if (documentElement)
    520             buildContentForNode(documentElement, &param);
    521 
    522         encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
    523     }
    524 
    525     ASSERT(m_dataBuffer.isEmpty());
    526     m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
    527     return didSerialization;
    528 }
    529 
    530 }  // namespace WebKit
    531