Home | History | Annotate | Download | only in web
      1 /*
      2  * Copyright (C) 2009 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
// How we can handle the base tag better.
// Current status:
// Currently, the base tag is handled as follows:
// a) Links that have corresponding locally saved files, such as savable CSS
// and JavaScript files, are rewritten as relative URLs that point to the
// locally saved file. They are not resolved to absolute file URLs because
// absolute URLs would become dead links as soon as the saved files are moved
// from one directory to another.
// b) Links that have no corresponding locally saved files, such as links in
// A and AREA tags, are resolved to absolute URLs.
// c) All base tags are commented out when serializing the DOM for the page.
// Firefox also handles the base tag this way.
     44 //
// Problem:
// This approach cannot handle the case where the base tag is written by
// JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href="http://www.yahoo.com/"...');" to set up the base
// URL of the page while it loads. Suppose we save "www.yahoo.com" as a
// complete HTML page to "c:\yahoo.htm". When the saved page is loaded later,
// the JavaScript inserts a base tag <base href="http://www.yahoo.com/"...>
// into the DOM, so all URLs that point to locally saved resource files are
// resolved as "http://www.yahoo.com/yahoo_files/...", and none of the saved
// resource files can be loaded correctly. The page also renders badly because
// the saved sub-resource files (such as CSS and JavaScript files) and
// sub-frame files cannot be fetched.
// Firefox, IE and WebKit-based browsers currently all have this problem.
     60 //
// Solution:
// The solution is to comment out the old base tag and write a new base tag
// <base href="." ...> right after the commented one. WebKit always uses the
// "href" attribute of the latest base tag to set the document's base URL.
// Based on this behavior, whenever we encounter a base tag we comment it out
// and write a new base tag <base href="."> after it. The newly added base tag
// helps the engine locate the correct base URL for loading the locally saved
// resource files. We also think the new base tag needs to inherit the base
// target value from the document object.
// If the original document has multiple base tags, every old base tag is
// commented out and a new base tag is appended after each of them, because we
// cannot tell whether an old base tag is original content or was added by
// JavaScript. If it was added by JavaScript, the script(s) will insert the
// base tag(s) into the DOM again when the saved page is loaded, so the newly
// added base tag(s) override the incorrect base URL and make sure we always
// load the correct locally saved resource files.
     77 
     78 #include "config.h"
     79 #include "web/WebPageSerializerImpl.h"
     80 
     81 #include "core/HTMLNames.h"
     82 #include "core/dom/Document.h"
     83 #include "core/dom/DocumentType.h"
     84 #include "core/dom/Element.h"
     85 #include "core/editing/markup.h"
     86 #include "core/html/HTMLAllCollection.h"
     87 #include "core/html/HTMLElement.h"
     88 #include "core/html/HTMLFormElement.h"
     89 #include "core/html/HTMLHtmlElement.h"
     90 #include "core/html/HTMLMetaElement.h"
     91 #include "core/loader/DocumentLoader.h"
     92 #include "core/loader/FrameLoader.h"
     93 #include "public/platform/WebVector.h"
     94 #include "web/WebLocalFrameImpl.h"
     95 #include "wtf/text/TextEncoding.h"
     96 
     97 using namespace WebCore;
     98 
     99 namespace blink {
    100 
// Maximum length of the data buffer that temporarily holds generated HTML
// content before it is encoded and flushed to the client. This is a soft
// limit: it can be exceeded when a single very large contiguous string is
// appended to the buffer.
static const unsigned dataBufferCapacity = 65536;
    105 
    106 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
    107                                                             const WTF::TextEncoding& textEncoding,
    108                                                             Document* document,
    109                                                             const String& directoryName)
    110     : url(url)
    111     , textEncoding(textEncoding)
    112     , document(document)
    113     , directoryName(directoryName)
    114     , isHTMLDocument(document->isHTMLDocument())
    115     , haveSeenDocType(false)
    116     , haveAddedCharsetDeclaration(false)
    117     , skipMetaElement(0)
    118     , isInScriptOrStyleTag(false)
    119     , haveAddedXMLProcessingDirective(false)
    120     , haveAddedContentsBeforeEnd(false)
    121 {
    122 }
    123 
    124 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
    125     const Element* element, SerializeDomParam* param, bool* needSkip)
    126 {
    127     StringBuilder result;
    128 
    129     *needSkip = false;
    130     if (param->isHTMLDocument) {
    131         // Skip the open tag of original META tag which declare charset since we
    132         // have overrided the META which have correct charset declaration after
    133         // serializing open tag of HEAD element.
    134         ASSERT(element);
    135         if (isHTMLMetaElement(*element)) {
    136             const HTMLMetaElement& meta = toHTMLMetaElement(*element);
    137             // Check whether the META tag has declared charset or not.
    138             String equiv = meta.httpEquiv();
    139             if (equalIgnoringCase(equiv, "content-type")) {
    140                 String content = meta.content();
    141                 if (content.length() && content.contains("charset", false)) {
    142                     // Find META tag declared charset, we need to skip it when
    143                     // serializing DOM.
    144                     param->skipMetaElement = element;
    145                     *needSkip = true;
    146                 }
    147             }
    148         } else if (isHTMLHtmlElement(*element)) {
    149             // Check something before processing the open tag of HEAD element.
    150             // First we add doc type declaration if original document has it.
    151             if (!param->haveSeenDocType) {
    152                 param->haveSeenDocType = true;
    153                 result.append(createMarkup(param->document->doctype()));
    154             }
    155 
    156             // Add MOTW declaration before html tag.
    157             // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
    158             result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
    159         } else if (isHTMLBaseElement(*element)) {
    160             // Comment the BASE tag when serializing dom.
    161             result.append("<!--");
    162         }
    163     } else {
    164         // Write XML declaration.
    165         if (!param->haveAddedXMLProcessingDirective) {
    166             param->haveAddedXMLProcessingDirective = true;
    167             // Get encoding info.
    168             String xmlEncoding = param->document->xmlEncoding();
    169             if (xmlEncoding.isEmpty())
    170                 xmlEncoding = param->document->encodingName();
    171             if (xmlEncoding.isEmpty())
    172                 xmlEncoding = UTF8Encoding().name();
    173             result.append("<?xml version=\"");
    174             result.append(param->document->xmlVersion());
    175             result.append("\" encoding=\"");
    176             result.append(xmlEncoding);
    177             if (param->document->xmlStandalone())
    178                 result.append("\" standalone=\"yes");
    179             result.append("\"?>\n");
    180         }
    181         // Add doc type declaration if original document has it.
    182         if (!param->haveSeenDocType) {
    183             param->haveSeenDocType = true;
    184             result.append(createMarkup(param->document->doctype()));
    185         }
    186     }
    187     return result.toString();
    188 }
    189 
    190 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
    191     const Element* element, SerializeDomParam* param)
    192 {
    193     StringBuilder result;
    194 
    195     param->haveAddedContentsBeforeEnd = false;
    196     if (!param->isHTMLDocument)
    197         return result.toString();
    198     // Check after processing the open tag of HEAD element
    199     if (!param->haveAddedCharsetDeclaration
    200         && isHTMLHeadElement(*element)) {
    201         param->haveAddedCharsetDeclaration = true;
    202         // Check meta element. WebKit only pre-parse the first 512 bytes
    203         // of the document. If the whole <HEAD> is larger and meta is the
    204         // end of head part, then this kind of pages aren't decoded correctly
    205         // because of this issue. So when we serialize the DOM, we need to
    206         // make sure the meta will in first child of head tag.
    207         // See http://bugs.webkit.org/show_bug.cgi?id=16621.
    208         // First we generate new content for writing correct META element.
    209         result.append(WebPageSerializer::generateMetaCharsetDeclaration(
    210             String(param->textEncoding.name())));
    211 
    212         param->haveAddedContentsBeforeEnd = true;
    213         // Will search each META which has charset declaration, and skip them all
    214         // in PreActionBeforeSerializeOpenTag.
    215     } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
    216         param->isInScriptOrStyleTag = true;
    217     }
    218 
    219     return result.toString();
    220 }
    221 
    222 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
    223     const Element* element, SerializeDomParam* param, bool* needSkip)
    224 {
    225     String result;
    226 
    227     *needSkip = false;
    228     if (!param->isHTMLDocument)
    229         return result;
    230     // Skip the end tag of original META tag which declare charset.
    231     // Need not to check whether it's META tag since we guarantee
    232     // skipMetaElement is definitely META tag if it's not 0.
    233     if (param->skipMetaElement == element) {
    234         *needSkip = true;
    235     } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
    236         ASSERT(param->isInScriptOrStyleTag);
    237         param->isInScriptOrStyleTag = false;
    238     }
    239 
    240     return result;
    241 }
    242 
    243 // After we finish serializing end tag of a element, we give the target
    244 // element a chance to do some post work to add some additional data.
    245 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
    246     const Element* element, SerializeDomParam* param)
    247 {
    248     StringBuilder result;
    249 
    250     if (!param->isHTMLDocument)
    251         return result.toString();
    252     // Comment the BASE tag when serializing DOM.
    253     if (isHTMLBaseElement(*element)) {
    254         result.append("-->");
    255         // Append a new base tag declaration.
    256         result.append(WebPageSerializer::generateBaseTagDeclaration(
    257             param->document->baseTarget()));
    258     }
    259 
    260     return result.toString();
    261 }
    262 
    263 void WebPageSerializerImpl::saveHTMLContentToBuffer(
    264     const String& result, SerializeDomParam* param)
    265 {
    266     m_dataBuffer.append(result);
    267     encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
    268                          param,
    269                          DoNotForceFlush);
    270 }
    271 
    272 void WebPageSerializerImpl::encodeAndFlushBuffer(
    273     WebPageSerializerClient::PageSerializationStatus status,
    274     SerializeDomParam* param,
    275     FlushOption flushOption)
    276 {
    277     // Data buffer is not full nor do we want to force flush.
    278     if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
    279         return;
    280 
    281     String content = m_dataBuffer.toString();
    282     m_dataBuffer.clear();
    283 
    284     CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables);
    285 
    286     // Send result to the client.
    287     m_client->didSerializeDataForFrame(param->url,
    288                                        WebCString(encodedContent.data(), encodedContent.length()),
    289                                        status);
    290 }
    291 
    292 void WebPageSerializerImpl::openTagToString(Element* element,
    293                                             SerializeDomParam* param)
    294 {
    295     bool needSkip;
    296     StringBuilder result;
    297     // Do pre action for open tag.
    298     result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
    299     if (needSkip)
    300         return;
    301     // Add open tag
    302     result.append('<');
    303     result.append(element->nodeName().lower());
    304     // Go through all attributes and serialize them.
    305     if (element->hasAttributes()) {
    306         AttributeCollection attributes = element->attributes();
    307         AttributeCollection::const_iterator end = attributes.end();
    308         for (AttributeCollection::const_iterator it = attributes.begin(); it != end; ++it) {
    309             result.append(' ');
    310             // Add attribute pair
    311             result.append(it->name().toString());
    312             result.appendLiteral("=\"");
    313             if (!it->value().isEmpty()) {
    314                 const String& attrValue = it->value();
    315 
    316                 // Check whether we need to replace some resource links
    317                 // with local resource paths.
    318                 const QualifiedName& attrName = it->name();
    319                 if (element->hasLegalLinkAttribute(attrName)) {
    320                     // For links start with "javascript:", we do not change it.
    321                     if (attrValue.startsWith("javascript:", false))
    322                         result.append(attrValue);
    323                     else {
    324                         // Get the absolute link
    325                         WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element);
    326                         String completeURL = subFrame ? subFrame->frame()->document()->url() :
    327                                                         param->document->completeURL(attrValue);
    328                         // Check whether we have local files for those link.
    329                         if (m_localLinks.contains(completeURL)) {
    330                             if (!param->directoryName.isEmpty()) {
    331                                 result.appendLiteral("./");
    332                                 result.append(param->directoryName);
    333                                 result.append('/');
    334                             }
    335                             result.append(m_localLinks.get(completeURL));
    336                         } else
    337                             result.append(completeURL);
    338                     }
    339                 } else {
    340                     if (param->isHTMLDocument)
    341                         result.append(m_htmlEntities.convertEntitiesInString(attrValue));
    342                     else
    343                         result.append(m_xmlEntities.convertEntitiesInString(attrValue));
    344                 }
    345             }
    346             result.append('\"');
    347         }
    348     }
    349 
    350     // Do post action for open tag.
    351     String addedContents = postActionAfterSerializeOpenTag(element, param);
    352     // Complete the open tag for element when it has child/children.
    353     if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
    354         result.append('>');
    355     // Append the added contents generate in  post action of open tag.
    356     result.append(addedContents);
    357     // Save the result to data buffer.
    358     saveHTMLContentToBuffer(result.toString(), param);
    359 }
    360 
    361 // Serialize end tag of an specified element.
    362 void WebPageSerializerImpl::endTagToString(Element* element,
    363                                            SerializeDomParam* param)
    364 {
    365     bool needSkip;
    366     StringBuilder result;
    367     // Do pre action for end tag.
    368     result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
    369     if (needSkip)
    370         return;
    371     // Write end tag when element has child/children.
    372     if (element->hasChildren() || param->haveAddedContentsBeforeEnd) {
    373         result.appendLiteral("</");
    374         result.append(element->nodeName().lower());
    375         result.append('>');
    376     } else {
    377         // Check whether we have to write end tag for empty element.
    378         if (param->isHTMLDocument) {
    379             result.append('>');
    380             // FIXME: This code is horribly wrong.  WebPageSerializerImpl must die.
    381             if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) {
    382                 // We need to write end tag when it is required.
    383                 result.appendLiteral("</");
    384                 result.append(element->nodeName().lower());
    385                 result.append('>');
    386             }
    387         } else {
    388             // For xml base document.
    389             result.appendLiteral(" />");
    390         }
    391     }
    392     // Do post action for end tag.
    393     result.append(postActionAfterSerializeEndTag(element, param));
    394     // Save the result to data buffer.
    395     saveHTMLContentToBuffer(result.toString(), param);
    396 }
    397 
    398 void WebPageSerializerImpl::buildContentForNode(Node* node,
    399                                                 SerializeDomParam* param)
    400 {
    401     switch (node->nodeType()) {
    402     case Node::ELEMENT_NODE:
    403         // Process open tag of element.
    404         openTagToString(toElement(node), param);
    405         // Walk through the children nodes and process it.
    406         for (Node *child = node->firstChild(); child; child = child->nextSibling())
    407             buildContentForNode(child, param);
    408         // Process end tag of element.
    409         endTagToString(toElement(node), param);
    410         break;
    411     case Node::TEXT_NODE:
    412         saveHTMLContentToBuffer(createMarkup(node), param);
    413         break;
    414     case Node::ATTRIBUTE_NODE:
    415     case Node::DOCUMENT_NODE:
    416     case Node::DOCUMENT_FRAGMENT_NODE:
    417         // Should not exist.
    418         ASSERT_NOT_REACHED();
    419         break;
    420     // Document type node can be in DOM?
    421     case Node::DOCUMENT_TYPE_NODE:
    422         param->haveSeenDocType = true;
    423     default:
    424         // For other type node, call default action.
    425         saveHTMLContentToBuffer(createMarkup(node), param);
    426         break;
    427     }
    428 }
    429 
    430 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
    431                                              bool recursiveSerialization,
    432                                              WebPageSerializerClient* client,
    433                                              const WebVector<WebURL>& links,
    434                                              const WebVector<WebString>& localPaths,
    435                                              const WebString& localDirectoryName)
    436     : m_client(client)
    437     , m_recursiveSerialization(recursiveSerialization)
    438     , m_framesCollected(false)
    439     , m_localDirectoryName(localDirectoryName)
    440     , m_htmlEntities(false)
    441     , m_xmlEntities(true)
    442 {
    443     // Must specify available webframe.
    444     ASSERT(frame);
    445     m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);
    446     // Make sure we have non 0 client.
    447     ASSERT(client);
    448     // Build local resources map.
    449     ASSERT(links.size() == localPaths.size());
    450     for (size_t i = 0; i < links.size(); i++) {
    451         KURL url = links[i];
    452         ASSERT(!m_localLinks.contains(url.string()));
    453         m_localLinks.set(url.string(), localPaths[i]);
    454     }
    455 
    456     ASSERT(m_dataBuffer.isEmpty());
    457 }
    458 
    459 void WebPageSerializerImpl::collectTargetFrames()
    460 {
    461     ASSERT(!m_framesCollected);
    462     m_framesCollected = true;
    463 
    464     // First, process main frame.
    465     m_frames.append(m_specifiedWebLocalFrameImpl);
    466     // Return now if user only needs to serialize specified frame, not including
    467     // all sub-frames.
    468     if (!m_recursiveSerialization)
    469         return;
    470     // Collect all frames inside the specified frame.
    471     for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
    472         WebLocalFrameImpl* currentFrame = m_frames[i];
    473         // Get current using document.
    474         Document* currentDoc = currentFrame->frame()->document();
    475         // Go through sub-frames.
    476         RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all();
    477 
    478         for (unsigned i = 0; Element* element = all->item(i); ++i) {
    479             if (!element->isHTMLElement())
    480                 continue;
    481             WebLocalFrameImpl* webFrame =
    482                 WebLocalFrameImpl::fromFrameOwnerElement(element);
    483             if (webFrame)
    484                 m_frames.append(webFrame);
    485         }
    486     }
    487 }
    488 
    489 bool WebPageSerializerImpl::serialize()
    490 {
    491     if (!m_framesCollected)
    492         collectTargetFrames();
    493 
    494     bool didSerialization = false;
    495     KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url();
    496 
    497     for (unsigned i = 0; i < m_frames.size(); ++i) {
    498         WebLocalFrameImpl* webFrame = m_frames[i];
    499         Document* document = webFrame->frame()->document();
    500         const KURL& url = document->url();
    501 
    502         if (!url.isValid() || !m_localLinks.contains(url.string()))
    503             continue;
    504 
    505         didSerialization = true;
    506 
    507         const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();
    508         String directoryName = url == mainURL ? m_localDirectoryName : "";
    509 
    510         SerializeDomParam param(url, textEncoding, document, directoryName);
    511 
    512         Element* documentElement = document->documentElement();
    513         if (documentElement)
    514             buildContentForNode(documentElement, &param);
    515 
    516         encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
    517     }
    518 
    519     ASSERT(m_dataBuffer.isEmpty());
    520     m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
    521     return didSerialization;
    522 }
    523 
    524 }  // namespace blink
    525