Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright (C) 2009 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 // How we handle the base tag better.
     32 // Current status:
     33 // Currently, the normal way we handle the base tag is:
     34 // a) For those links which have corresponding local saved files, such as
     35 // savable CSS, JavaScript files, they will be written to relative URLs which
     36 // point to local saved file. Why those links can not be resolved as absolute
     37 // file URLs, because if they are resolved as absolute URLs, after moving the
     38 // file location from one directory to another directory, the file URLs will
     39 // be dead links.
     40 // b) For those links which have not corresponding local saved files, such as
     41 // links in A, AREA tags, they will be resolved as absolute URLs.
     42 // c) We comment all base tags when serializing DOM for the page.
     43 // FireFox also uses above way to handle base tag.
     44 //
     45 // Problem:
     46 // This way can not handle the following situation:
     47 // the base tag is written by JavaScript.
     48 // For example. The page "www.yahoo.com" use
     49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
     50 // of page when loading page. So when saving page as completed-HTML, we assume
     51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
     52 // completed-HTML page, then the JavaScript will insert a base tag
     53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
     54 // local saved resource files will be resolved as
     55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved  resource
     56 // files can not be loaded correctly. Also the page will be rendered ugly since
     57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
     58 // files can not be fetched.
     59 // Now FireFox, IE and WebKit based Browser all have this problem.
     60 //
     61 // Solution:
     62 // My solution is that we comment old base tag and write new base tag:
     63 // <base href="." ...> after the previous commented base tag. In WebKit, it
     64 // always uses the latest "href" attribute of base tag to set document's base
     65 // URL. Based on this behavior, when we encounter a base tag, we comment it and
     66 // write a new base tag <base href="."> after the previous commented base tag.
     67 // The new added base tag can help engine to locate correct base URL for
     68 // correctly loading local saved resource files. Also I think we need to inherit
     69 // the base target value from document object when appending new base tag.
     70 // If there are multiple base tags in original document, we will comment all old
     71 // base tags and append new base tag after each old base tag because we do not
     72 // know those old base tags are original content or added by JavaScript. If
     73 // they are added by JavaScript, it means when loading saved page, the script(s)
     74 // will still insert base tag(s) to DOM, so the new added base tag(s) can
     75 // override the incorrect base URL and make sure we always load correct local
     76 // saved resource files.
     77 
     78 #include "config.h"
     79 #include "WebPageSerializerImpl.h"
     80 
     81 #include "Document.h"
     82 #include "DocumentType.h"
     83 #include "Element.h"
     84 #include "FrameLoader.h"
     85 #include "HTMLAllCollection.h"
     86 #include "HTMLElement.h"
     87 #include "HTMLFormElement.h"
     88 #include "HTMLMetaElement.h"
     89 #include "HTMLNames.h"
     90 #include "KURL.h"
     91 #include "PlatformString.h"
     92 #include "StringBuilder.h"
     93 #include "TextEncoding.h"
     94 #include "markup.h"
     95 
     96 #include "DOMUtilitiesPrivate.h"
     97 #include "WebFrameImpl.h"
     98 #include "WebURL.h"
     99 #include "WebVector.h"
    100 
    101 using namespace WebCore;
    102 
    103 namespace WebKit {
    104 
    105 // Maximum length of the data buffer used to temporarily hold generated
    106 // html content data. This is a soft limit which might be exceeded if a very
    107 // large contiguous string is found in the page.
    108 static const unsigned dataBufferCapacity = 65536;
    109 
    110 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& currentFrameURL,
    111                                                             const TextEncoding& textEncoding,
    112                                                             Document* doc,
    113                                                             const String& directoryName)
    114     : currentFrameURL(currentFrameURL)
    115     , textEncoding(textEncoding)
    116     , doc(doc)
    117     , directoryName(directoryName)
    118     , hasDoctype(false)
    119     , hasCheckedMeta(false)
    120     , skipMetaElement(0)
    121     , isInScriptOrStyleTag(false)
    122     , hasDocDeclaration(false)
    123 {
    124     // Cache the value since we check it lots of times.
    125     isHTMLDocument = doc->isHTMLDocument();
    126 }
    127 
    128 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
    129     const Element* element, SerializeDomParam* param, bool* needSkip)
    130 {
    131     StringBuilder result;
    132 
    133     *needSkip = false;
    134     if (param->isHTMLDocument) {
    135         // Skip the open tag of original META tag which declare charset since we
    136         // have overrided the META which have correct charset declaration after
    137         // serializing open tag of HEAD element.
    138         if (element->hasTagName(HTMLNames::metaTag)) {
    139             const HTMLMetaElement* meta = static_cast<const HTMLMetaElement*>(element);
    140             // Check whether the META tag has declared charset or not.
    141             String equiv = meta->httpEquiv();
    142             if (equalIgnoringCase(equiv, "content-type")) {
    143                 String content = meta->content();
    144                 if (content.length() && content.contains("charset", false)) {
    145                     // Find META tag declared charset, we need to skip it when
    146                     // serializing DOM.
    147                     param->skipMetaElement = element;
    148                     *needSkip = true;
    149                 }
    150             }
    151         } else if (element->hasTagName(HTMLNames::htmlTag)) {
    152             // Check something before processing the open tag of HEAD element.
    153             // First we add doc type declaration if original doc has it.
    154             if (!param->hasDoctype) {
    155                 param->hasDoctype = true;
    156                 result.append(createMarkup(param->doc->doctype()));
    157             }
    158 
    159             // Add MOTW declaration before html tag.
    160             // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
    161             result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->currentFrameURL));
    162         } else if (element->hasTagName(HTMLNames::baseTag)) {
    163             // Comment the BASE tag when serializing dom.
    164             result.append("<!--");
    165         }
    166     } else {
    167         // Write XML declaration.
    168         if (!param->hasDocDeclaration) {
    169             param->hasDocDeclaration = true;
    170             // Get encoding info.
    171             String xmlEncoding = param->doc->xmlEncoding();
    172             if (xmlEncoding.isEmpty())
    173                 xmlEncoding = param->doc->frame()->loader()->encoding();
    174             if (xmlEncoding.isEmpty())
    175                 xmlEncoding = UTF8Encoding().name();
    176             result.append("<?xml version=\"");
    177             result.append(param->doc->xmlVersion());
    178             result.append("\" encoding=\"");
    179             result.append(xmlEncoding);
    180             if (param->doc->xmlStandalone())
    181                 result.append("\" standalone=\"yes");
    182             result.append("\"?>\n");
    183         }
    184         // Add doc type declaration if original doc has it.
    185         if (!param->hasDoctype) {
    186             param->hasDoctype = true;
    187             result.append(createMarkup(param->doc->doctype()));
    188         }
    189     }
    190     return result.toString();
    191 }
    192 
    193 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
    194     const Element* element, SerializeDomParam* param)
    195 {
    196     StringBuilder result;
    197 
    198     param->hasAddedContentsBeforeEnd = false;
    199     if (!param->isHTMLDocument)
    200         return result.toString();
    201     // Check after processing the open tag of HEAD element
    202     if (!param->hasCheckedMeta
    203         && element->hasTagName(HTMLNames::headTag)) {
    204         param->hasCheckedMeta = true;
    205         // Check meta element. WebKit only pre-parse the first 512 bytes
    206         // of the document. If the whole <HEAD> is larger and meta is the
    207         // end of head part, then this kind of pages aren't decoded correctly
    208         // because of this issue. So when we serialize the DOM, we need to
    209         // make sure the meta will in first child of head tag.
    210         // See http://bugs.webkit.org/show_bug.cgi?id=16621.
    211         // First we generate new content for writing correct META element.
    212         result.append(WebPageSerializer::generateMetaCharsetDeclaration(
    213             String(param->textEncoding.name())));
    214 
    215         param->hasAddedContentsBeforeEnd = true;
    216         // Will search each META which has charset declaration, and skip them all
    217         // in PreActionBeforeSerializeOpenTag.
    218     } else if (element->hasTagName(HTMLNames::scriptTag)
    219                || element->hasTagName(HTMLNames::styleTag)) {
    220         param->isInScriptOrStyleTag = true;
    221     }
    222 
    223     return result.toString();
    224 }
    225 
    226 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
    227     const Element* element, SerializeDomParam* param, bool* needSkip)
    228 {
    229     String result;
    230 
    231     *needSkip = false;
    232     if (!param->isHTMLDocument)
    233         return result;
    234     // Skip the end tag of original META tag which declare charset.
    235     // Need not to check whether it's META tag since we guarantee
    236     // skipMetaElement is definitely META tag if it's not 0.
    237     if (param->skipMetaElement == element)
    238         *needSkip = true;
    239     else if (element->hasTagName(HTMLNames::scriptTag)
    240              || element->hasTagName(HTMLNames::styleTag)) {
    241         ASSERT(param->isInScriptOrStyleTag);
    242         param->isInScriptOrStyleTag = false;
    243     }
    244 
    245     return result;
    246 }
    247 
    248 // After we finish serializing end tag of a element, we give the target
    249 // element a chance to do some post work to add some additional data.
    250 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
    251     const Element* element, SerializeDomParam* param)
    252 {
    253     StringBuilder result;
    254 
    255     if (!param->isHTMLDocument)
    256         return result.toString();
    257     // Comment the BASE tag when serializing DOM.
    258     if (element->hasTagName(HTMLNames::baseTag)) {
    259         result.append("-->");
    260         // Append a new base tag declaration.
    261         result.append(WebPageSerializer::generateBaseTagDeclaration(
    262             param->doc->baseTarget()));
    263     }
    264 
    265     return result.toString();
    266 }
    267 
    268 void WebPageSerializerImpl::saveHTMLContentToBuffer(
    269     const String& result, SerializeDomParam* param)
    270 {
    271     m_dataBuffer.append(result);
    272     encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
    273                          param,
    274                          0);
    275 }
    276 
    277 void WebPageSerializerImpl::encodeAndFlushBuffer(
    278     WebPageSerializerClient::PageSerializationStatus status,
    279     SerializeDomParam* param,
    280     bool force)
    281 {
    282     // Data buffer is not full nor do we want to force flush.
    283     if (!force && m_dataBuffer.length() <= dataBufferCapacity)
    284         return;
    285 
    286     String content = m_dataBuffer.toString();
    287     m_dataBuffer.clear();
    288 
    289     // Convert the unicode content to target encoding
    290     CString encodedContent = param->textEncoding.encode(
    291         content.characters(), content.length(), EntitiesForUnencodables);
    292 
    293     // Send result to the client.
    294     m_client->didSerializeDataForFrame(param->currentFrameURL,
    295                                        WebCString(encodedContent.data(), encodedContent.length()),
    296                                        status);
    297 }
    298 
    299 void WebPageSerializerImpl::openTagToString(const Element* element,
    300                                             SerializeDomParam* param)
    301 {
    302     // FIXME: use StringBuilder instead of String.
    303     bool needSkip;
    304     // Do pre action for open tag.
    305     String result = preActionBeforeSerializeOpenTag(element, param, &needSkip);
    306     if (needSkip)
    307         return;
    308     // Add open tag
    309     result += "<" + element->nodeName();
    310     // Go through all attributes and serialize them.
    311     const NamedNodeMap *attrMap = element->attributes(true);
    312     if (attrMap) {
    313         unsigned numAttrs = attrMap->length();
    314         for (unsigned i = 0; i < numAttrs; i++) {
    315             result += " ";
    316             // Add attribute pair
    317             const Attribute *attribute = attrMap->attributeItem(i);
    318             result += attribute->name().toString();
    319             result += "=\"";
    320             if (!attribute->value().isEmpty()) {
    321                 const String& attrValue = attribute->value();
    322 
    323                 // Check whether we need to replace some resource links
    324                 // with local resource paths.
    325                 const QualifiedName& attrName = attribute->name();
    326                 if (elementHasLegalLinkAttribute(element, attrName)) {
    327                     // For links start with "javascript:", we do not change it.
    328                     if (attrValue.startsWith("javascript:", false))
    329                         result += attrValue;
    330                     else {
    331                         // Get the absolute link
    332                         String completeURL = param->doc->completeURL(attrValue);
    333                         // Check whether we have local files for those link.
    334                         if (m_localLinks.contains(completeURL)) {
    335                             if (!m_localDirectoryName.isEmpty())
    336                                 result += "./" + m_localDirectoryName + "/";
    337                             result += m_localLinks.get(completeURL);
    338                         } else
    339                             result += completeURL;
    340                     }
    341                 } else {
    342                     if (param->isHTMLDocument)
    343                         result += m_htmlEntities.convertEntitiesInString(attrValue);
    344                     else
    345                         result += m_xmlEntities.convertEntitiesInString(attrValue);
    346                 }
    347             }
    348             result += "\"";
    349         }
    350     }
    351 
    352     // Do post action for open tag.
    353     String addedContents = postActionAfterSerializeOpenTag(element, param);
    354     // Complete the open tag for element when it has child/children.
    355     if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd)
    356         result += ">";
    357     // Append the added contents generate in  post action of open tag.
    358     result += addedContents;
    359     // Save the result to data buffer.
    360     saveHTMLContentToBuffer(result, param);
    361 }
    362 
    363 // Serialize end tag of an specified element.
    364 void WebPageSerializerImpl::endTagToString(const Element* element,
    365                                            SerializeDomParam* param)
    366 {
    367     bool needSkip;
    368     // Do pre action for end tag.
    369     String result = preActionBeforeSerializeEndTag(element,
    370                                                    param,
    371                                                    &needSkip);
    372     if (needSkip)
    373         return;
    374     // Write end tag when element has child/children.
    375     if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd) {
    376         result += "</";
    377         result += element->nodeName();
    378         result += ">";
    379     } else {
    380         // Check whether we have to write end tag for empty element.
    381         if (param->isHTMLDocument) {
    382             result += ">";
    383             const HTMLElement* htmlElement =
    384             static_cast<const HTMLElement*>(element);
    385             if (htmlElement->endTagRequirement() == TagStatusRequired) {
    386                 // We need to write end tag when it is required.
    387                 result += "</";
    388                 result += element->nodeName();
    389                 result += ">";
    390             }
    391         } else {
    392             // For xml base document.
    393             result += " />";
    394         }
    395     }
    396     // Do post action for end tag.
    397     result += postActionAfterSerializeEndTag(element, param);
    398     // Save the result to data buffer.
    399     saveHTMLContentToBuffer(result, param);
    400 }
    401 
    402 void WebPageSerializerImpl::buildContentForNode(const Node* node,
    403                                                 SerializeDomParam* param)
    404 {
    405     switch (node->nodeType()) {
    406     case Node::ELEMENT_NODE:
    407         // Process open tag of element.
    408         openTagToString(static_cast<const Element*>(node), param);
    409         // Walk through the children nodes and process it.
    410         for (const Node *child = node->firstChild(); child; child = child->nextSibling())
    411             buildContentForNode(child, param);
    412         // Process end tag of element.
    413         endTagToString(static_cast<const Element*>(node), param);
    414         break;
    415     case Node::TEXT_NODE:
    416         saveHTMLContentToBuffer(createMarkup(node), param);
    417         break;
    418     case Node::ATTRIBUTE_NODE:
    419     case Node::DOCUMENT_NODE:
    420     case Node::DOCUMENT_FRAGMENT_NODE:
    421         // Should not exist.
    422         ASSERT_NOT_REACHED();
    423         break;
    424     // Document type node can be in DOM?
    425     case Node::DOCUMENT_TYPE_NODE:
    426         param->hasDoctype = true;
    427     default:
    428         // For other type node, call default action.
    429         saveHTMLContentToBuffer(createMarkup(node), param);
    430         break;
    431     }
    432 }
    433 
    434 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
    435                                              bool recursiveSerialization,
    436                                              WebPageSerializerClient* client,
    437                                              const WebVector<WebURL>& links,
    438                                              const WebVector<WebString>& localPaths,
    439                                              const WebString& localDirectoryName)
    440     : m_client(client)
    441     , m_recursiveSerialization(recursiveSerialization)
    442     , m_framesCollected(false)
    443     , m_localDirectoryName(localDirectoryName)
    444     , m_htmlEntities(false)
    445     , m_xmlEntities(true)
    446 {
    447     // Must specify available webframe.
    448     ASSERT(frame);
    449     m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame);
    450     // Make sure we have non 0 client.
    451     ASSERT(client);
    452     // Build local resources map.
    453     ASSERT(links.size() == localPaths.size());
    454     for (size_t i = 0; i < links.size(); i++) {
    455         KURL url = links[i];
    456         ASSERT(!m_localLinks.contains(url.string()));
    457         m_localLinks.set(url.string(), localPaths[i]);
    458     }
    459 
    460     ASSERT(!m_dataBuffer.length());
    461 }
    462 
    463 void WebPageSerializerImpl::collectTargetFrames()
    464 {
    465     ASSERT(!m_framesCollected);
    466     m_framesCollected = true;
    467 
    468     // First, process main frame.
    469     m_frames.append(m_specifiedWebFrameImpl);
    470     // Return now if user only needs to serialize specified frame, not including
    471     // all sub-frames.
    472     if (!m_recursiveSerialization)
    473         return;
    474     // Collect all frames inside the specified frame.
    475     for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
    476         WebFrameImpl* currentFrame = m_frames[i];
    477         // Get current using document.
    478         Document* currentDoc = currentFrame->frame()->document();
    479         // Go through sub-frames.
    480         RefPtr<HTMLAllCollection> all = currentDoc->all();
    481         for (Node* node = all->firstItem(); node; node = all->nextItem()) {
    482             if (!node->isHTMLElement())
    483                 continue;
    484             Element* element = static_cast<Element*>(node);
    485             WebFrameImpl* webFrame =
    486                 WebFrameImpl::fromFrameOwnerElement(element);
    487             if (webFrame)
    488                 m_frames.append(webFrame);
    489         }
    490     }
    491 }
    492 
    493 bool WebPageSerializerImpl::serialize()
    494 {
    495     // Collect target frames.
    496     if (!m_framesCollected)
    497         collectTargetFrames();
    498     bool didSerialization = false;
    499     // Get KURL for main frame.
    500     KURL mainPageURL = m_specifiedWebFrameImpl->frame()->loader()->url();
    501 
    502     // Go through all frames for serializing DOM for whole page, include
    503     // sub-frames.
    504     for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
    505         // Get current serializing frame.
    506         WebFrameImpl* currentFrame = m_frames[i];
    507         // Get current using document.
    508         Document* currentDoc = currentFrame->frame()->document();
    509         // Get current frame's URL.
    510         const KURL& currentFrameURL = currentFrame->frame()->loader()->url();
    511 
    512         // Check whether we have done this document.
    513         if (m_localLinks.contains(currentFrameURL.string())) {
    514             // A new document, we will serialize it.
    515             didSerialization = true;
    516             // Get target encoding for current document.
    517             String encoding = currentFrame->frame()->loader()->encoding();
    518             // Create the text encoding object with target encoding.
    519             TextEncoding textEncoding(encoding);
    520             // Construct serialize parameter for late processing document.
    521             SerializeDomParam param(currentFrameURL,
    522                                     encoding.length() ? textEncoding : UTF8Encoding(),
    523                                     currentDoc,
    524                                     currentFrameURL == mainPageURL ? m_localDirectoryName : "");
    525 
    526             // Process current document.
    527             Element* rootElement = currentDoc->documentElement();
    528             if (rootElement)
    529                 buildContentForNode(rootElement, &param);
    530 
    531             // Flush the remainder data and finish serializing current frame.
    532             encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished,
    533                                  &param,
    534                                  1);
    535         }
    536     }
    537 
    538     // We have done call frames, so we send message to embedder to tell it that
    539     // frames are finished serializing.
    540     ASSERT(!m_dataBuffer.length());
    541     m_client->didSerializeDataForFrame(KURL(),
    542                                        WebCString("", 0),
    543                                        WebPageSerializerClient::AllFramesAreFinished);
    544     return didSerialization;
    545 }
    546 
    547 }  // namespace WebKit
    548