Home | History | Annotate | Download | only in web
      1 /*
      2  * Copyright (C) 2009 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 // How we handle the base tag better.
     32 // Current status:
     33 // Currently, the normal way we handle the base tag is:
     34 // a) For those links which have corresponding local saved files, such as
     35 // savable CSS, JavaScript files, they will be written to relative URLs which
     36 // point to local saved file. Why those links can not be resolved as absolute
     37 // file URLs, because if they are resolved as absolute URLs, after moving the
     38 // file location from one directory to another directory, the file URLs will
     39 // be dead links.
     40 // b) For those links which have not corresponding local saved files, such as
     41 // links in A, AREA tags, they will be resolved as absolute URLs.
     42 // c) We comment all base tags when serializing DOM for the page.
     43 // FireFox also uses above way to handle base tag.
     44 //
     45 // Problem:
     46 // This way can not handle the following situation:
     47 // the base tag is written by JavaScript.
     48 // For example. The page "www.yahoo.com" use
     49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
     50 // of page when loading page. So when saving page as completed-HTML, we assume
     51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
     52 // completed-HTML page, then the JavaScript will insert a base tag
     53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
     54 // local saved resource files will be resolved as
     55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved  resource
     56 // files can not be loaded correctly. Also the page will be rendered ugly since
     57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
     58 // files can not be fetched.
     59 // Now FireFox, IE and WebKit based Browser all have this problem.
     60 //
     61 // Solution:
     62 // My solution is that we comment old base tag and write new base tag:
     63 // <base href="." ...> after the previous commented base tag. In WebKit, it
     64 // always uses the latest "href" attribute of base tag to set document's base
     65 // URL. Based on this behavior, when we encounter a base tag, we comment it and
     66 // write a new base tag <base href="."> after the previous commented base tag.
     67 // The new added base tag can help engine to locate correct base URL for
     68 // correctly loading local saved resource files. Also I think we need to inherit
     69 // the base target value from document object when appending new base tag.
     70 // If there are multiple base tags in original document, we will comment all old
     71 // base tags and append new base tag after each old base tag because we do not
     72 // know those old base tags are original content or added by JavaScript. If
     73 // they are added by JavaScript, it means when loading saved page, the script(s)
     74 // will still insert base tag(s) to DOM, so the new added base tag(s) can
     75 // override the incorrect base URL and make sure we always load correct local
     76 // saved resource files.
     77 
     78 #include "config.h"
     79 #include "web/WebPageSerializerImpl.h"
     80 
     81 #include "core/HTMLNames.h"
     82 #include "core/dom/Document.h"
     83 #include "core/dom/DocumentType.h"
     84 #include "core/dom/Element.h"
     85 #include "core/editing/markup.h"
     86 #include "core/html/HTMLAllCollection.h"
     87 #include "core/html/HTMLElement.h"
     88 #include "core/html/HTMLFormElement.h"
     89 #include "core/html/HTMLHtmlElement.h"
     90 #include "core/html/HTMLMetaElement.h"
     91 #include "core/loader/DocumentLoader.h"
     92 #include "core/loader/FrameLoader.h"
     93 #include "public/platform/WebVector.h"
     94 #include "web/WebLocalFrameImpl.h"
     95 #include "wtf/text/TextEncoding.h"
     96 
     97 namespace blink {
     98 
     99 // Maximum length of data buffer which is used to temporary save generated
    100 // html content data. This is a soft limit which might be passed if a very large
    101 // contegious string is found in the page.
    102 static const unsigned dataBufferCapacity = 65536;
    103 
    104 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
    105                                                             const WTF::TextEncoding& textEncoding,
    106                                                             Document* document,
    107                                                             const String& directoryName)
    108     : url(url)
    109     , textEncoding(textEncoding)
    110     , document(document)
    111     , directoryName(directoryName)
    112     , isHTMLDocument(document->isHTMLDocument())
    113     , haveSeenDocType(false)
    114     , haveAddedCharsetDeclaration(false)
    115     , skipMetaElement(0)
    116     , isInScriptOrStyleTag(false)
    117     , haveAddedXMLProcessingDirective(false)
    118     , haveAddedContentsBeforeEnd(false)
    119 {
    120 }
    121 
    122 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
    123     const Element* element, SerializeDomParam* param, bool* needSkip)
    124 {
    125     StringBuilder result;
    126 
    127     *needSkip = false;
    128     if (param->isHTMLDocument) {
    129         // Skip the open tag of original META tag which declare charset since we
    130         // have overrided the META which have correct charset declaration after
    131         // serializing open tag of HEAD element.
    132         ASSERT(element);
    133         if (isHTMLMetaElement(*element)) {
    134             const HTMLMetaElement& meta = toHTMLMetaElement(*element);
    135             // Check whether the META tag has declared charset or not.
    136             String equiv = meta.httpEquiv();
    137             if (equalIgnoringCase(equiv, "content-type")) {
    138                 String content = meta.content();
    139                 if (content.length() && content.contains("charset", false)) {
    140                     // Find META tag declared charset, we need to skip it when
    141                     // serializing DOM.
    142                     param->skipMetaElement = element;
    143                     *needSkip = true;
    144                 }
    145             }
    146         } else if (isHTMLHtmlElement(*element)) {
    147             // Check something before processing the open tag of HEAD element.
    148             // First we add doc type declaration if original document has it.
    149             if (!param->haveSeenDocType) {
    150                 param->haveSeenDocType = true;
    151                 result.append(createMarkup(param->document->doctype()));
    152             }
    153 
    154             // Add MOTW declaration before html tag.
    155             // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
    156             result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
    157         } else if (isHTMLBaseElement(*element)) {
    158             // Comment the BASE tag when serializing dom.
    159             result.appendLiteral("<!--");
    160         }
    161     } else {
    162         // Write XML declaration.
    163         if (!param->haveAddedXMLProcessingDirective) {
    164             param->haveAddedXMLProcessingDirective = true;
    165             // Get encoding info.
    166             String xmlEncoding = param->document->xmlEncoding();
    167             if (xmlEncoding.isEmpty())
    168                 xmlEncoding = param->document->encodingName();
    169             if (xmlEncoding.isEmpty())
    170                 xmlEncoding = UTF8Encoding().name();
    171             result.appendLiteral("<?xml version=\"");
    172             result.append(param->document->xmlVersion());
    173             result.appendLiteral("\" encoding=\"");
    174             result.append(xmlEncoding);
    175             if (param->document->xmlStandalone())
    176                 result.appendLiteral("\" standalone=\"yes");
    177             result.appendLiteral("\"?>\n");
    178         }
    179         // Add doc type declaration if original document has it.
    180         if (!param->haveSeenDocType) {
    181             param->haveSeenDocType = true;
    182             result.append(createMarkup(param->document->doctype()));
    183         }
    184     }
    185     return result.toString();
    186 }
    187 
    188 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
    189     const Element* element, SerializeDomParam* param)
    190 {
    191     StringBuilder result;
    192 
    193     param->haveAddedContentsBeforeEnd = false;
    194     if (!param->isHTMLDocument)
    195         return result.toString();
    196     // Check after processing the open tag of HEAD element
    197     if (!param->haveAddedCharsetDeclaration
    198         && isHTMLHeadElement(*element)) {
    199         param->haveAddedCharsetDeclaration = true;
    200         // Check meta element. WebKit only pre-parse the first 512 bytes
    201         // of the document. If the whole <HEAD> is larger and meta is the
    202         // end of head part, then this kind of pages aren't decoded correctly
    203         // because of this issue. So when we serialize the DOM, we need to
    204         // make sure the meta will in first child of head tag.
    205         // See http://bugs.webkit.org/show_bug.cgi?id=16621.
    206         // First we generate new content for writing correct META element.
    207         result.append(WebPageSerializer::generateMetaCharsetDeclaration(
    208             String(param->textEncoding.name())));
    209 
    210         param->haveAddedContentsBeforeEnd = true;
    211         // Will search each META which has charset declaration, and skip them all
    212         // in PreActionBeforeSerializeOpenTag.
    213     } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
    214         param->isInScriptOrStyleTag = true;
    215     }
    216 
    217     return result.toString();
    218 }
    219 
    220 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
    221     const Element* element, SerializeDomParam* param, bool* needSkip)
    222 {
    223     String result;
    224 
    225     *needSkip = false;
    226     if (!param->isHTMLDocument)
    227         return result;
    228     // Skip the end tag of original META tag which declare charset.
    229     // Need not to check whether it's META tag since we guarantee
    230     // skipMetaElement is definitely META tag if it's not 0.
    231     if (param->skipMetaElement == element) {
    232         *needSkip = true;
    233     } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
    234         ASSERT(param->isInScriptOrStyleTag);
    235         param->isInScriptOrStyleTag = false;
    236     }
    237 
    238     return result;
    239 }
    240 
    241 // After we finish serializing end tag of a element, we give the target
    242 // element a chance to do some post work to add some additional data.
    243 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
    244     const Element* element, SerializeDomParam* param)
    245 {
    246     StringBuilder result;
    247 
    248     if (!param->isHTMLDocument)
    249         return result.toString();
    250     // Comment the BASE tag when serializing DOM.
    251     if (isHTMLBaseElement(*element)) {
    252         result.appendLiteral("-->");
    253         // Append a new base tag declaration.
    254         result.append(WebPageSerializer::generateBaseTagDeclaration(
    255             param->document->baseTarget()));
    256     }
    257 
    258     return result.toString();
    259 }
    260 
    261 void WebPageSerializerImpl::saveHTMLContentToBuffer(
    262     const String& result, SerializeDomParam* param)
    263 {
    264     m_dataBuffer.append(result);
    265     encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
    266                          param,
    267                          DoNotForceFlush);
    268 }
    269 
    270 void WebPageSerializerImpl::encodeAndFlushBuffer(
    271     WebPageSerializerClient::PageSerializationStatus status,
    272     SerializeDomParam* param,
    273     FlushOption flushOption)
    274 {
    275     // Data buffer is not full nor do we want to force flush.
    276     if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
    277         return;
    278 
    279     String content = m_dataBuffer.toString();
    280     m_dataBuffer.clear();
    281 
    282     CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables);
    283 
    284     // Send result to the client.
    285     m_client->didSerializeDataForFrame(param->url,
    286                                        WebCString(encodedContent.data(), encodedContent.length()),
    287                                        status);
    288 }
    289 
    290 void WebPageSerializerImpl::openTagToString(Element* element,
    291                                             SerializeDomParam* param)
    292 {
    293     bool needSkip;
    294     StringBuilder result;
    295     // Do pre action for open tag.
    296     result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
    297     if (needSkip)
    298         return;
    299     // Add open tag
    300     result.append('<');
    301     result.append(element->nodeName().lower());
    302     // Go through all attributes and serialize them.
    303     AttributeCollection attributes = element->attributes();
    304     AttributeCollection::iterator end = attributes.end();
    305     for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) {
    306         result.append(' ');
    307         // Add attribute pair
    308         result.append(it->name().toString());
    309         result.appendLiteral("=\"");
    310         if (!it->value().isEmpty()) {
    311             const String& attrValue = it->value();
    312 
    313             // Check whether we need to replace some resource links
    314             // with local resource paths.
    315             const QualifiedName& attrName = it->name();
    316             if (element->hasLegalLinkAttribute(attrName)) {
    317                 // For links start with "javascript:", we do not change it.
    318                 if (attrValue.startsWith("javascript:", false)) {
    319                     result.append(attrValue);
    320                 } else {
    321                     // Get the absolute link
    322                     WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element);
    323                     String completeURL = subFrame ? subFrame->frame()->document()->url() :
    324                                                     param->document->completeURL(attrValue);
    325                     // Check whether we have local files for those link.
    326                     if (m_localLinks.contains(completeURL)) {
    327                         if (!param->directoryName.isEmpty()) {
    328                             result.appendLiteral("./");
    329                             result.append(param->directoryName);
    330                             result.append('/');
    331                         }
    332                         result.append(m_localLinks.get(completeURL));
    333                     } else {
    334                         result.append(completeURL);
    335                     }
    336                 }
    337             } else {
    338                 if (param->isHTMLDocument)
    339                     result.append(m_htmlEntities.convertEntitiesInString(attrValue));
    340                 else
    341                     result.append(m_xmlEntities.convertEntitiesInString(attrValue));
    342             }
    343         }
    344         result.append('\"');
    345     }
    346 
    347     // Do post action for open tag.
    348     String addedContents = postActionAfterSerializeOpenTag(element, param);
    349     // Complete the open tag for element when it has child/children.
    350     if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
    351         result.append('>');
    352     // Append the added contents generate in  post action of open tag.
    353     result.append(addedContents);
    354     // Save the result to data buffer.
    355     saveHTMLContentToBuffer(result.toString(), param);
    356 }
    357 
    358 // Serialize end tag of an specified element.
    359 void WebPageSerializerImpl::endTagToString(Element* element,
    360                                            SerializeDomParam* param)
    361 {
    362     bool needSkip;
    363     StringBuilder result;
    364     // Do pre action for end tag.
    365     result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
    366     if (needSkip)
    367         return;
    368     // Write end tag when element has child/children.
    369     if (element->hasChildren() || param->haveAddedContentsBeforeEnd) {
    370         result.appendLiteral("</");
    371         result.append(element->nodeName().lower());
    372         result.append('>');
    373     } else {
    374         // Check whether we have to write end tag for empty element.
    375         if (param->isHTMLDocument) {
    376             result.append('>');
    377             // FIXME: This code is horribly wrong.  WebPageSerializerImpl must die.
    378             if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) {
    379                 // We need to write end tag when it is required.
    380                 result.appendLiteral("</");
    381                 result.append(element->nodeName().lower());
    382                 result.append('>');
    383             }
    384         } else {
    385             // For xml base document.
    386             result.appendLiteral(" />");
    387         }
    388     }
    389     // Do post action for end tag.
    390     result.append(postActionAfterSerializeEndTag(element, param));
    391     // Save the result to data buffer.
    392     saveHTMLContentToBuffer(result.toString(), param);
    393 }
    394 
    395 void WebPageSerializerImpl::buildContentForNode(Node* node,
    396                                                 SerializeDomParam* param)
    397 {
    398     switch (node->nodeType()) {
    399     case Node::ELEMENT_NODE:
    400         // Process open tag of element.
    401         openTagToString(toElement(node), param);
    402         // Walk through the children nodes and process it.
    403         for (Node *child = node->firstChild(); child; child = child->nextSibling())
    404             buildContentForNode(child, param);
    405         // Process end tag of element.
    406         endTagToString(toElement(node), param);
    407         break;
    408     case Node::TEXT_NODE:
    409         saveHTMLContentToBuffer(createMarkup(node), param);
    410         break;
    411     case Node::ATTRIBUTE_NODE:
    412     case Node::DOCUMENT_NODE:
    413     case Node::DOCUMENT_FRAGMENT_NODE:
    414         // Should not exist.
    415         ASSERT_NOT_REACHED();
    416         break;
    417     // Document type node can be in DOM?
    418     case Node::DOCUMENT_TYPE_NODE:
    419         param->haveSeenDocType = true;
    420     default:
    421         // For other type node, call default action.
    422         saveHTMLContentToBuffer(createMarkup(node), param);
    423         break;
    424     }
    425 }
    426 
    427 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
    428                                              bool recursiveSerialization,
    429                                              WebPageSerializerClient* client,
    430                                              const WebVector<WebURL>& links,
    431                                              const WebVector<WebString>& localPaths,
    432                                              const WebString& localDirectoryName)
    433     : m_client(client)
    434     , m_recursiveSerialization(recursiveSerialization)
    435     , m_framesCollected(false)
    436     , m_localDirectoryName(localDirectoryName)
    437     , m_htmlEntities(false)
    438     , m_xmlEntities(true)
    439 {
    440     // Must specify available webframe.
    441     ASSERT(frame);
    442     m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);
    443     // Make sure we have non 0 client.
    444     ASSERT(client);
    445     // Build local resources map.
    446     ASSERT(links.size() == localPaths.size());
    447     for (size_t i = 0; i < links.size(); i++) {
    448         KURL url = links[i];
    449         ASSERT(!m_localLinks.contains(url.string()));
    450         m_localLinks.set(url.string(), localPaths[i]);
    451     }
    452 
    453     ASSERT(m_dataBuffer.isEmpty());
    454 }
    455 
    456 void WebPageSerializerImpl::collectTargetFrames()
    457 {
    458     ASSERT(!m_framesCollected);
    459     m_framesCollected = true;
    460 
    461     // First, process main frame.
    462     m_frames.append(m_specifiedWebLocalFrameImpl);
    463     // Return now if user only needs to serialize specified frame, not including
    464     // all sub-frames.
    465     if (!m_recursiveSerialization)
    466         return;
    467     // Collect all frames inside the specified frame.
    468     for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
    469         WebLocalFrameImpl* currentFrame = m_frames[i];
    470         // Get current using document.
    471         Document* currentDoc = currentFrame->frame()->document();
    472         // Go through sub-frames.
    473         RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all();
    474 
    475         for (unsigned i = 0; Element* element = all->item(i); ++i) {
    476             if (!element->isHTMLElement())
    477                 continue;
    478             WebLocalFrameImpl* webFrame =
    479                 WebLocalFrameImpl::fromFrameOwnerElement(element);
    480             if (webFrame)
    481                 m_frames.append(webFrame);
    482         }
    483     }
    484 }
    485 
    486 bool WebPageSerializerImpl::serialize()
    487 {
    488     if (!m_framesCollected)
    489         collectTargetFrames();
    490 
    491     bool didSerialization = false;
    492     KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url();
    493 
    494     for (unsigned i = 0; i < m_frames.size(); ++i) {
    495         WebLocalFrameImpl* webFrame = m_frames[i];
    496         Document* document = webFrame->frame()->document();
    497         const KURL& url = document->url();
    498 
    499         if (!url.isValid() || !m_localLinks.contains(url.string()))
    500             continue;
    501 
    502         didSerialization = true;
    503 
    504         const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();
    505         String directoryName = url == mainURL ? m_localDirectoryName : "";
    506 
    507         SerializeDomParam param(url, textEncoding, document, directoryName);
    508 
    509         Element* documentElement = document->documentElement();
    510         if (documentElement)
    511             buildContentForNode(documentElement, &param);
    512 
    513         encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
    514     }
    515 
    516     ASSERT(m_dataBuffer.isEmpty());
    517     m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
    518     return didSerialization;
    519 }
    520 
    521 }  // namespace blink
    522