Home | History | Annotate | Download | only in renderer
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/bind.h"
      6 #include "base/command_line.h"
      7 #include "base/compiler_specific.h"
      8 #include "base/containers/hash_tables.h"
      9 #include "base/file_util.h"
     10 #include "base/files/file_path.h"
     11 #include "base/strings/string_util.h"
     12 #include "base/strings/utf_string_conversions.h"
     13 #include "content/public/common/content_switches.h"
     14 #include "content/public/renderer/render_view.h"
     15 #include "content/public/renderer/render_view_observer.h"
     16 #include "content/public/test/test_utils.h"
     17 #include "content/renderer/savable_resources.h"
     18 #include "content/shell/browser/shell.h"
     19 #include "content/test/content_browser_test.h"
     20 #include "content/test/content_browser_test_utils.h"
     21 #include "net/base/net_util.h"
     22 #include "net/url_request/url_request_context.h"
     23 #include "third_party/WebKit/public/platform/WebCString.h"
     24 #include "third_party/WebKit/public/platform/WebData.h"
     25 #include "third_party/WebKit/public/platform/WebString.h"
     26 #include "third_party/WebKit/public/platform/WebURL.h"
     27 #include "third_party/WebKit/public/platform/WebVector.h"
     28 #include "third_party/WebKit/public/web/WebDocument.h"
     29 #include "third_party/WebKit/public/web/WebElement.h"
     30 #include "third_party/WebKit/public/web/WebFrame.h"
     31 #include "third_party/WebKit/public/web/WebNode.h"
     32 #include "third_party/WebKit/public/web/WebNodeCollection.h"
     33 #include "third_party/WebKit/public/web/WebNodeList.h"
     34 #include "third_party/WebKit/public/web/WebPageSerializer.h"
     35 #include "third_party/WebKit/public/web/WebPageSerializerClient.h"
     36 #include "third_party/WebKit/public/web/WebView.h"
     37 
     38 using blink::WebCString;
     39 using blink::WebData;
     40 using blink::WebDocument;
     41 using blink::WebElement;
     42 using blink::WebFrame;
     43 using blink::WebNode;
     44 using blink::WebNodeCollection;
     45 using blink::WebNodeList;
     46 using blink::WebPageSerializer;
     47 using blink::WebPageSerializerClient;
     48 using blink::WebNode;
     49 using blink::WebString;
     50 using blink::WebURL;
     51 using blink::WebView;
     52 using blink::WebVector;
     53 
     54 namespace content {
     55 
     56 // Iterate recursively over sub-frames to find one with with a given url.
     57 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) {
     58   if (!web_view->mainFrame())
     59     return NULL;
     60 
     61   std::vector<WebFrame*> stack;
     62   stack.push_back(web_view->mainFrame());
     63 
     64   while (!stack.empty()) {
     65     WebFrame* current_frame = stack.back();
     66     stack.pop_back();
     67     if (GURL(current_frame->document().url()) == url)
     68       return current_frame;
     69     WebNodeCollection all = current_frame->document().all();
     70     for (WebNode node = all.firstItem();
     71          !node.isNull(); node = all.nextItem()) {
     72       if (!node.isElementNode())
     73         continue;
     74       // Check frame tag and iframe tag
     75       WebElement element = node.to<WebElement>();
     76       if (!element.hasTagName("frame") && !element.hasTagName("iframe"))
     77         continue;
     78       WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element);
     79       if (sub_frame)
     80         stack.push_back(sub_frame);
     81     }
     82   }
     83   return NULL;
     84 }
     85 
     86 // Helper function that test whether the first node in the doc is a doc type
     87 // node.
     88 bool HasDocType(const WebDocument& doc) {
     89   WebNode node = doc.firstChild();
     90   if (node.isNull())
     91     return false;
     92   return node.nodeType() == WebNode::DocumentTypeNode;
     93 }
     94 
     95   // Helper function for checking whether input node is META tag. Return true
     96 // means it is META element, otherwise return false. The parameter charset_info
     97 // return actual charset info if the META tag has charset declaration.
     98 bool IsMetaElement(const WebNode& node, std::string& charset_info) {
     99   if (!node.isElementNode())
    100     return false;
    101   const WebElement meta = node.toConst<WebElement>();
    102   if (!meta.hasTagName("meta"))
    103     return false;
    104   charset_info.erase(0, charset_info.length());
    105   // Check the META charset declaration.
    106   WebString httpEquiv = meta.getAttribute("http-equiv");
    107   if (LowerCaseEqualsASCII(httpEquiv, "content-type")) {
    108     std::string content = meta.getAttribute("content").utf8();
    109     int pos = content.find("charset", 0);
    110     if (pos > -1) {
    111       // Add a dummy charset declaration to charset_info, which indicates this
    112       // META tag has charset declaration although we do not get correct value
    113       // yet.
    114       charset_info.append("has-charset-declaration");
    115       int remaining_length = content.length() - pos - 7;
    116       if (!remaining_length)
    117         return true;
    118       int start_pos = pos + 7;
    119       // Find "=" symbol.
    120       while (remaining_length--)
    121         if (content[start_pos++] == L'=')
    122           break;
    123       // Skip beginning space.
    124       while (remaining_length) {
    125         if (content[start_pos] > 0x0020)
    126           break;
    127         ++start_pos;
    128         --remaining_length;
    129       }
    130       if (!remaining_length)
    131         return true;
    132       int end_pos = start_pos;
    133       // Now we find out the start point of charset info. Search the end point.
    134       while (remaining_length--) {
    135         if (content[end_pos] <= 0x0020 || content[end_pos] == L';')
    136           break;
    137         ++end_pos;
    138       }
    139       // Get actual charset info.
    140       charset_info = content.substr(start_pos, end_pos - start_pos);
    141       return true;
    142     }
    143   }
    144   return true;
    145 }
    146 
    147 class LoadObserver : public RenderViewObserver {
    148  public:
    149   LoadObserver(RenderView* render_view, const base::Closure& quit_closure)
    150       : RenderViewObserver(render_view),
    151         quit_closure_(quit_closure) {}
    152 
    153   virtual void DidFinishLoad(blink::WebFrame* frame) OVERRIDE {
    154     if (frame == render_view()->GetWebView()->mainFrame())
    155       quit_closure_.Run();
    156   }
    157 
    158  private:
    159   base::Closure quit_closure_;
    160 };
    161 
    162 class DomSerializerTests : public ContentBrowserTest,
    163                            public WebPageSerializerClient {
    164  public:
    165   DomSerializerTests()
    166     : serialized_(false),
    167       local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) {}
    168 
    169   virtual void SetUpCommandLine(CommandLine* command_line) OVERRIDE {
    170     command_line->AppendSwitch(switches::kSingleProcess);
    171 #if defined(OS_WIN) && defined(USE_AURA)
    172     // Don't want to try to create a GPU process.
    173     command_line->AppendSwitch(switches::kDisableAcceleratedCompositing);
    174 #endif
    175   }
    176 
    177   // DomSerializerDelegate.
    178   virtual void didSerializeDataForFrame(const WebURL& frame_web_url,
    179                                         const WebCString& data,
    180                                         PageSerializationStatus status) {
    181 
    182     GURL frame_url(frame_web_url);
    183     // If the all frames are finished saving, check all finish status
    184     if (status == WebPageSerializerClient::AllFramesAreFinished) {
    185       SerializationFinishStatusMap::iterator it =
    186           serialization_finish_status_.begin();
    187       for (; it != serialization_finish_status_.end(); ++it)
    188         ASSERT_TRUE(it->second);
    189       serialized_ = true;
    190       return;
    191     }
    192 
    193     // Check finish status of current frame.
    194     SerializationFinishStatusMap::iterator it =
    195         serialization_finish_status_.find(frame_url.spec());
    196     // New frame, set initial status as false.
    197     if (it == serialization_finish_status_.end())
    198       serialization_finish_status_[frame_url.spec()] = false;
    199 
    200     it = serialization_finish_status_.find(frame_url.spec());
    201     ASSERT_TRUE(it != serialization_finish_status_.end());
    202     // In process frame, finish status should be false.
    203     ASSERT_FALSE(it->second);
    204 
    205     // Add data to corresponding frame's content.
    206     serialized_frame_map_[frame_url.spec()] += data.data();
    207 
    208     // Current frame is completed saving, change the finish status.
    209     if (status == WebPageSerializerClient::CurrentFrameIsFinished)
    210       it->second = true;
    211   }
    212 
    213   bool HasSerializedFrame(const GURL& frame_url) {
    214     return serialized_frame_map_.find(frame_url.spec()) !=
    215            serialized_frame_map_.end();
    216   }
    217 
    218   const std::string& GetSerializedContentForFrame(
    219       const GURL& frame_url) {
    220     return serialized_frame_map_[frame_url.spec()];
    221   }
    222 
    223   RenderView* GetRenderView() {
    224     // We could have the test on the UI thread get the WebContent's routing ID,
    225     // but we know this will be the first RV so skip that and just hardcode it.
    226     return RenderView::FromRoutingID(1);
    227   }
    228 
    229   WebView* GetWebView() {
    230     return GetRenderView()->GetWebView();
    231   }
    232 
    233   WebFrame* GetMainFrame() {
    234     return GetWebView()->mainFrame();
    235   }
    236 
    237   // Load web page according to input content and relative URLs within
    238   // the document.
    239   void LoadContents(const std::string& contents,
    240                     const GURL& base_url,
    241                     const WebString encoding_info) {
    242     scoped_refptr<MessageLoopRunner> runner = new MessageLoopRunner;
    243     LoadObserver observer(GetRenderView(), runner->QuitClosure());
    244 
    245     // If input encoding is empty, use UTF-8 as default encoding.
    246     if (encoding_info.isEmpty()) {
    247       GetMainFrame()->loadHTMLString(contents, base_url);
    248     } else {
    249       WebData data(contents.data(), contents.length());
    250 
    251       // Do not use WebFrame.LoadHTMLString because it assumes that input
    252       // html contents use UTF-8 encoding.
    253       // TODO(darin): This should use WebFrame::loadData.
    254       WebFrame* web_frame = GetMainFrame();
    255 
    256       ASSERT_TRUE(web_frame != NULL);
    257 
    258       web_frame->loadData(data, "text/html", encoding_info, base_url);
    259     }
    260 
    261     runner->Run();
    262   }
    263 
    264   // Serialize page DOM according to specific page URL. The parameter
    265   // recursive_serialization indicates whether we will serialize all
    266   // sub-frames.
    267   void SerializeDomForURL(const GURL& page_url,
    268                           bool recursive_serialization) {
    269     // Find corresponding WebFrame according to page_url.
    270     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), page_url);
    271     ASSERT_TRUE(web_frame != NULL);
    272     WebVector<WebURL> links;
    273     links.assign(&page_url, 1);
    274     WebString file_path =
    275         base::FilePath(FILE_PATH_LITERAL("c:\\dummy.htm")).AsUTF16Unsafe();
    276     WebVector<WebString> local_paths;
    277     local_paths.assign(&file_path, 1);
    278     // Start serializing DOM.
    279     bool result = WebPageSerializer::serialize(web_frame,
    280        recursive_serialization,
    281        static_cast<WebPageSerializerClient*>(this),
    282        links,
    283        local_paths,
    284        local_directory_name_.AsUTF16Unsafe());
    285     ASSERT_TRUE(result);
    286     ASSERT_TRUE(serialized_);
    287   }
    288 
    289   void SerializeHTMLDOMWithDocTypeOnRenderer(const GURL& file_url) {
    290     // Make sure original contents have document type.
    291     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    292     ASSERT_TRUE(web_frame != NULL);
    293     WebDocument doc = web_frame->document();
    294     ASSERT_TRUE(HasDocType(doc));
    295     // Do serialization.
    296     SerializeDomForURL(file_url, false);
    297     // Load the serialized contents.
    298     ASSERT_TRUE(HasSerializedFrame(file_url));
    299     const std::string& serialized_contents =
    300         GetSerializedContentForFrame(file_url);
    301     LoadContents(serialized_contents, file_url,
    302                  web_frame->document().encoding());
    303     // Make sure serialized contents still have document type.
    304     web_frame = GetMainFrame();
    305     doc = web_frame->document();
    306     ASSERT_TRUE(HasDocType(doc));
    307   }
    308 
    309   void SerializeHTMLDOMWithoutDocTypeOnRenderer(const GURL& file_url) {
    310     // Make sure original contents do not have document type.
    311     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    312     ASSERT_TRUE(web_frame != NULL);
    313     WebDocument doc = web_frame->document();
    314     ASSERT_TRUE(!HasDocType(doc));
    315     // Do serialization.
    316     SerializeDomForURL(file_url, false);
    317     // Load the serialized contents.
    318     ASSERT_TRUE(HasSerializedFrame(file_url));
    319     const std::string& serialized_contents =
    320         GetSerializedContentForFrame(file_url);
    321     LoadContents(serialized_contents, file_url,
    322                  web_frame->document().encoding());
    323     // Make sure serialized contents do not have document type.
    324     web_frame = GetMainFrame();
    325     doc = web_frame->document();
    326     ASSERT_TRUE(!HasDocType(doc));
    327   }
    328 
    329   void SerializeXMLDocWithBuiltInEntitiesOnRenderer(
    330       const GURL& xml_file_url, const std::string& original_contents) {
    331     // Do serialization.
    332     SerializeDomForURL(xml_file_url, false);
    333     // Compare the serialized contents with original contents.
    334     ASSERT_TRUE(HasSerializedFrame(xml_file_url));
    335     const std::string& serialized_contents =
    336         GetSerializedContentForFrame(xml_file_url);
    337     ASSERT_EQ(original_contents, serialized_contents);
    338   }
    339 
    340   void SerializeHTMLDOMWithAddingMOTWOnRenderer(
    341       const GURL& file_url, const std::string& original_contents) {
    342     // Make sure original contents does not have MOTW;
    343     std::string motw_declaration =
    344        WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
    345     ASSERT_FALSE(motw_declaration.empty());
    346     // The encoding of original contents is ISO-8859-1, so we convert the MOTW
    347     // declaration to ASCII and search whether original contents has it or not.
    348     ASSERT_TRUE(std::string::npos == original_contents.find(motw_declaration));
    349 
    350     // Do serialization.
    351     SerializeDomForURL(file_url, false);
    352     // Make sure the serialized contents have MOTW ;
    353     ASSERT_TRUE(HasSerializedFrame(file_url));
    354     const std::string& serialized_contents =
    355         GetSerializedContentForFrame(file_url);
    356     ASSERT_FALSE(std::string::npos ==
    357         serialized_contents.find(motw_declaration));
    358   }
    359 
    360   void SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer(
    361       const GURL& file_url) {
    362     // Make sure there is no META charset declaration in original document.
    363     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    364     ASSERT_TRUE(web_frame != NULL);
    365     WebDocument doc = web_frame->document();
    366     ASSERT_TRUE(doc.isHTMLDocument());
    367     WebElement head_element = doc.head();
    368     ASSERT_TRUE(!head_element.isNull());
    369     // Go through all children of HEAD element.
    370     for (WebNode child = head_element.firstChild(); !child.isNull();
    371          child = child.nextSibling()) {
    372       std::string charset_info;
    373       if (IsMetaElement(child, charset_info))
    374         ASSERT_TRUE(charset_info.empty());
    375     }
    376     // Do serialization.
    377     SerializeDomForURL(file_url, false);
    378 
    379     // Load the serialized contents.
    380     ASSERT_TRUE(HasSerializedFrame(file_url));
    381     const std::string& serialized_contents =
    382         GetSerializedContentForFrame(file_url);
    383     LoadContents(serialized_contents, file_url,
    384                  web_frame->document().encoding());
    385     // Make sure the first child of HEAD element is META which has charset
    386     // declaration in serialized contents.
    387     web_frame = GetMainFrame();
    388     ASSERT_TRUE(web_frame != NULL);
    389     doc = web_frame->document();
    390     ASSERT_TRUE(doc.isHTMLDocument());
    391     head_element = doc.head();
    392     ASSERT_TRUE(!head_element.isNull());
    393     WebNode meta_node = head_element.firstChild();
    394     ASSERT_TRUE(!meta_node.isNull());
    395     // Get meta charset info.
    396     std::string charset_info2;
    397     ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
    398     ASSERT_TRUE(!charset_info2.empty());
    399     ASSERT_EQ(charset_info2,
    400               std::string(web_frame->document().encoding().utf8()));
    401 
    402     // Make sure no more additional META tags which have charset declaration.
    403     for (WebNode child = meta_node.nextSibling(); !child.isNull();
    404          child = child.nextSibling()) {
    405       std::string charset_info;
    406       if (IsMetaElement(child, charset_info))
    407         ASSERT_TRUE(charset_info.empty());
    408     }
    409   }
    410 
    411   void SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer(
    412       const GURL& file_url) {
    413     // Make sure there are multiple META charset declarations in original
    414     // document.
    415     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    416     ASSERT_TRUE(web_frame != NULL);
    417     WebDocument doc = web_frame->document();
    418     ASSERT_TRUE(doc.isHTMLDocument());
    419     WebElement head_ele = doc.head();
    420     ASSERT_TRUE(!head_ele.isNull());
    421     // Go through all children of HEAD element.
    422     int charset_declaration_count = 0;
    423     for (WebNode child = head_ele.firstChild(); !child.isNull();
    424          child = child.nextSibling()) {
    425       std::string charset_info;
    426       if (IsMetaElement(child, charset_info) && !charset_info.empty())
    427         charset_declaration_count++;
    428     }
    429     // The original doc has more than META tags which have charset declaration.
    430     ASSERT_TRUE(charset_declaration_count > 1);
    431 
    432     // Do serialization.
    433     SerializeDomForURL(file_url, false);
    434 
    435     // Load the serialized contents.
    436     ASSERT_TRUE(HasSerializedFrame(file_url));
    437     const std::string& serialized_contents =
    438         GetSerializedContentForFrame(file_url);
    439     LoadContents(serialized_contents, file_url,
    440                  web_frame->document().encoding());
    441     // Make sure only first child of HEAD element is META which has charset
    442     // declaration in serialized contents.
    443     web_frame = GetMainFrame();
    444     ASSERT_TRUE(web_frame != NULL);
    445     doc = web_frame->document();
    446     ASSERT_TRUE(doc.isHTMLDocument());
    447     head_ele = doc.head();
    448     ASSERT_TRUE(!head_ele.isNull());
    449     WebNode meta_node = head_ele.firstChild();
    450     ASSERT_TRUE(!meta_node.isNull());
    451     // Get meta charset info.
    452     std::string charset_info2;
    453     ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
    454     ASSERT_TRUE(!charset_info2.empty());
    455     ASSERT_EQ(charset_info2,
    456               std::string(web_frame->document().encoding().utf8()));
    457 
    458     // Make sure no more additional META tags which have charset declaration.
    459     for (WebNode child = meta_node.nextSibling(); !child.isNull();
    460          child = child.nextSibling()) {
    461       std::string charset_info;
    462       if (IsMetaElement(child, charset_info))
    463         ASSERT_TRUE(charset_info.empty());
    464     }
    465   }
    466 
    467   void SerializeHTMLDOMWithEntitiesInTextOnRenderer() {
    468     base::FilePath page_file_path = GetTestFilePath(
    469         "dom_serializer", "dom_serializer/htmlentities_in_text.htm");
    470     // Get file URL. The URL is dummy URL to identify the following loading
    471     // actions. The test content is in constant:original_contents.
    472     GURL file_url = net::FilePathToFileURL(page_file_path);
    473     ASSERT_TRUE(file_url.SchemeIsFile());
    474     // Test contents.
    475     static const char* const original_contents =
    476         "<html><body>&amp;&lt;&gt;\"\'</body></html>";
    477     // Load the test contents.
    478     LoadContents(original_contents, file_url, WebString());
    479 
    480     // Get BODY's text content in DOM.
    481     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    482     ASSERT_TRUE(web_frame != NULL);
    483     WebDocument doc = web_frame->document();
    484     ASSERT_TRUE(doc.isHTMLDocument());
    485     WebElement body_ele = doc.body();
    486     ASSERT_TRUE(!body_ele.isNull());
    487     WebNode text_node = body_ele.firstChild();
    488     ASSERT_TRUE(text_node.isTextNode());
    489     ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) ==
    490                 "&amp;&lt;&gt;\"\'");
    491     // Do serialization.
    492     SerializeDomForURL(file_url, false);
    493     // Compare the serialized contents with original contents.
    494     ASSERT_TRUE(HasSerializedFrame(file_url));
    495     const std::string& serialized_contents =
    496         GetSerializedContentForFrame(file_url);
    497     // Compare the serialized contents with original contents to make sure
    498     // they are same.
    499     // Because we add MOTW when serializing DOM, so before comparison, we also
    500     // need to add MOTW to original_contents.
    501     std::string original_str =
    502       WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
    503     original_str += original_contents;
    504     // Since WebCore now inserts a new HEAD element if there is no HEAD element
    505     // when creating BODY element. (Please see
    506     // HTMLParser::bodyCreateErrorCheck.) We need to append the HEAD content and
    507     // corresponding META content if we find WebCore-generated HEAD element.
    508     if (!doc.head().isNull()) {
    509       WebString encoding = web_frame->document().encoding();
    510       std::string htmlTag("<html>");
    511       std::string::size_type pos = original_str.find(htmlTag);
    512       ASSERT_NE(std::string::npos, pos);
    513       pos += htmlTag.length();
    514       std::string head_part("<head>");
    515       head_part +=
    516           WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
    517       head_part += "</head>";
    518       original_str.insert(pos, head_part);
    519     }
    520     ASSERT_EQ(original_str, serialized_contents);
    521   }
    522 
    523   void SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer() {
    524     base::FilePath page_file_path = GetTestFilePath(
    525         "dom_serializer", "dom_serializer/htmlentities_in_attribute_value.htm");
    526     // Get file URL. The URL is dummy URL to identify the following loading
    527     // actions. The test content is in constant:original_contents.
    528     GURL file_url = net::FilePathToFileURL(page_file_path);
    529     ASSERT_TRUE(file_url.SchemeIsFile());
    530     // Test contents.
    531     static const char* const original_contents =
    532         "<html><body title=\"&amp;&lt;&gt;&quot;&#39;\"></body></html>";
    533     // Load the test contents.
    534     LoadContents(original_contents, file_url, WebString());
    535     // Get value of BODY's title attribute in DOM.
    536     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    537     ASSERT_TRUE(web_frame != NULL);
    538     WebDocument doc = web_frame->document();
    539     ASSERT_TRUE(doc.isHTMLDocument());
    540     WebElement body_ele = doc.body();
    541     ASSERT_TRUE(!body_ele.isNull());
    542     WebString value = body_ele.getAttribute("title");
    543     ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'");
    544     // Do serialization.
    545     SerializeDomForURL(file_url, false);
    546     // Compare the serialized contents with original contents.
    547     ASSERT_TRUE(HasSerializedFrame(file_url));
    548     const std::string& serialized_contents =
    549         GetSerializedContentForFrame(file_url);
    550     // Compare the serialized contents with original contents to make sure
    551     // they are same.
    552     std::string original_str =
    553         WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
    554     original_str += original_contents;
    555     if (!doc.isNull()) {
    556       WebString encoding = web_frame->document().encoding();
    557       std::string htmlTag("<html>");
    558       std::string::size_type pos = original_str.find(htmlTag);
    559       ASSERT_NE(std::string::npos, pos);
    560       pos += htmlTag.length();
    561       std::string head_part("<head>");
    562       head_part +=
    563           WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
    564       head_part += "</head>";
    565       original_str.insert(pos, head_part);
    566     }
    567     ASSERT_EQ(original_str, serialized_contents);
    568   }
    569 
    570   void SerializeHTMLDOMWithNonStandardEntitiesOnRenderer(const GURL& file_url) {
    571     // Get value of BODY's title attribute in DOM.
    572     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    573     WebDocument doc = web_frame->document();
    574     ASSERT_TRUE(doc.isHTMLDocument());
    575     WebElement body_element = doc.body();
    576     // Unescaped string for "&percnt;&nsup;&sup1;&apos;".
    577     static const wchar_t parsed_value[] = {
    578       '%', 0x2285, 0x00b9, '\'', 0
    579     };
    580     WebString value = body_element.getAttribute("title");
    581     ASSERT_TRUE(UTF16ToWide(value) == parsed_value);
    582     ASSERT_TRUE(UTF16ToWide(body_element.innerText()) == parsed_value);
    583 
    584     // Do serialization.
    585     SerializeDomForURL(file_url, false);
    586     // Check the serialized string.
    587     ASSERT_TRUE(HasSerializedFrame(file_url));
    588     const std::string& serialized_contents =
    589         GetSerializedContentForFrame(file_url);
    590     // Confirm that the serialized string has no non-standard HTML entities.
    591     ASSERT_EQ(std::string::npos, serialized_contents.find("&percnt;"));
    592     ASSERT_EQ(std::string::npos, serialized_contents.find("&nsup;"));
    593     ASSERT_EQ(std::string::npos, serialized_contents.find("&sup1;"));
    594     ASSERT_EQ(std::string::npos, serialized_contents.find("&apos;"));
    595   }
    596 
    597   void SerializeHTMLDOMWithBaseTagOnRenderer(const GURL& file_url,
    598                                              const GURL& path_dir_url) {
    599     // There are total 2 available base tags in this test file.
    600     const int kTotalBaseTagCountInTestFile = 2;
    601 
    602     // Since for this test, we assume there is no savable sub-resource links for
    603     // this test file, also all links are relative URLs in this test file, so we
    604     // need to check those relative URLs and make sure document has BASE tag.
    605     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    606     ASSERT_TRUE(web_frame != NULL);
    607     WebDocument doc = web_frame->document();
    608     ASSERT_TRUE(doc.isHTMLDocument());
    609     // Go through all descent nodes.
    610     WebNodeCollection all = doc.all();
    611     int original_base_tag_count = 0;
    612     for (WebNode node = all.firstItem(); !node.isNull();
    613          node = all.nextItem()) {
    614       if (!node.isElementNode())
    615         continue;
    616       WebElement element = node.to<WebElement>();
    617       if (element.hasTagName("base")) {
    618         original_base_tag_count++;
    619       } else {
    620         // Get link.
    621         WebString value = GetSubResourceLinkFromElement(element);
    622         if (value.isNull() && element.hasTagName("a")) {
    623           value = element.getAttribute("href");
    624           if (value.isEmpty())
    625             value = WebString();
    626         }
    627         // Each link is relative link.
    628         if (!value.isNull()) {
    629           GURL link(value.utf8());
    630           ASSERT_TRUE(link.scheme().empty());
    631         }
    632       }
    633     }
    634     ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile);
    635     // Make sure in original document, the base URL is not equal with the
    636     // |path_dir_url|.
    637     GURL original_base_url(doc.baseURL());
    638     ASSERT_NE(original_base_url, path_dir_url);
    639 
    640     // Do serialization.
    641     SerializeDomForURL(file_url, false);
    642 
    643     // Load the serialized contents.
    644     ASSERT_TRUE(HasSerializedFrame(file_url));
    645     const std::string& serialized_contents =
    646         GetSerializedContentForFrame(file_url);
    647     LoadContents(serialized_contents, file_url,
    648                  web_frame->document().encoding());
    649 
    650     // Make sure all links are absolute URLs and doc there are some number of
    651     // BASE tags in serialized HTML data. Each of those BASE tags have same base
    652     // URL which is as same as URL of current test file.
    653     web_frame = GetMainFrame();
    654     ASSERT_TRUE(web_frame != NULL);
    655     doc = web_frame->document();
    656     ASSERT_TRUE(doc.isHTMLDocument());
    657     // Go through all descent nodes.
    658     all = doc.all();
    659     int new_base_tag_count = 0;
    660     for (WebNode node = all.firstItem(); !node.isNull();
    661          node = all.nextItem()) {
    662       if (!node.isElementNode())
    663         continue;
    664       WebElement element = node.to<WebElement>();
    665       if (element.hasTagName("base")) {
    666         new_base_tag_count++;
    667       } else {
    668         // Get link.
    669         WebString value = GetSubResourceLinkFromElement(element);
    670         if (value.isNull() && element.hasTagName("a")) {
    671           value = element.getAttribute("href");
    672           if (value.isEmpty())
    673             value = WebString();
    674         }
    675         // Each link is absolute link.
    676         if (!value.isNull()) {
    677           GURL link(std::string(value.utf8()));
    678           ASSERT_FALSE(link.scheme().empty());
    679         }
    680       }
    681     }
    682     // We have one more added BASE tag which is generated by JavaScript.
    683     ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1);
    684     // Make sure in new document, the base URL is equal with the |path_dir_url|.
    685     GURL new_base_url(doc.baseURL());
    686     ASSERT_EQ(new_base_url, path_dir_url);
    687   }
    688 
    689   void SerializeHTMLDOMWithEmptyHeadOnRenderer() {
    690     base::FilePath page_file_path = GetTestFilePath(
    691         "dom_serializer", "empty_head.htm");
    692     GURL file_url = net::FilePathToFileURL(page_file_path);
    693     ASSERT_TRUE(file_url.SchemeIsFile());
    694 
    695     // Load the test html content.
    696     static const char* const empty_head_contents =
    697       "<html><head></head><body>hello world</body></html>";
    698     LoadContents(empty_head_contents, file_url, WebString());
    699 
    700     // Make sure the head tag is empty.
    701     WebFrame* web_frame = GetMainFrame();
    702     ASSERT_TRUE(web_frame != NULL);
    703     WebDocument doc = web_frame->document();
    704     ASSERT_TRUE(doc.isHTMLDocument());
    705     WebElement head_element = doc.head();
    706     ASSERT_TRUE(!head_element.isNull());
    707     ASSERT_TRUE(!head_element.hasChildNodes());
    708     ASSERT_TRUE(head_element.childNodes().length() == 0);
    709 
    710     // Do serialization.
    711     SerializeDomForURL(file_url, false);
    712     // Make sure the serialized contents have META ;
    713     ASSERT_TRUE(HasSerializedFrame(file_url));
    714     const std::string& serialized_contents =
    715         GetSerializedContentForFrame(file_url);
    716 
    717     // Reload serialized contents and make sure there is only one META tag.
    718     LoadContents(serialized_contents, file_url,
    719                  web_frame->document().encoding());
    720     web_frame = GetMainFrame();
    721     ASSERT_TRUE(web_frame != NULL);
    722     doc = web_frame->document();
    723     ASSERT_TRUE(doc.isHTMLDocument());
    724     head_element = doc.head();
    725     ASSERT_TRUE(!head_element.isNull());
    726     ASSERT_TRUE(head_element.hasChildNodes());
    727     ASSERT_TRUE(head_element.childNodes().length() == 1);
    728     WebNode meta_node = head_element.firstChild();
    729     ASSERT_TRUE(!meta_node.isNull());
    730     // Get meta charset info.
    731     std::string charset_info;
    732     ASSERT_TRUE(IsMetaElement(meta_node, charset_info));
    733     ASSERT_TRUE(!charset_info.empty());
    734     ASSERT_EQ(charset_info,
    735               std::string(web_frame->document().encoding().utf8()));
    736 
    737     // Check the body's first node is text node and its contents are
    738     // "hello world"
    739     WebElement body_element = doc.body();
    740     ASSERT_TRUE(!body_element.isNull());
    741     WebNode text_node = body_element.firstChild();
    742     ASSERT_TRUE(text_node.isTextNode());
    743     WebString text_node_contents = text_node.nodeValue();
    744     ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world");
    745   }
    746 
    747   void SerializeDocumentWithDownloadedIFrameOnRenderer(const GURL& file_url) {
    748     // Do a recursive serialization. We pass if we don't crash.
    749     SerializeDomForURL(file_url, true);
    750   }
    751 
    752   void SubResourceForElementsInNonHTMLNamespaceOnRenderer(
    753       const GURL& file_url) {
    754     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    755     ASSERT_TRUE(web_frame != NULL);
    756     WebDocument doc = web_frame->document();
    757     WebNode lastNodeInBody = doc.body().lastChild();
    758     ASSERT_EQ(WebNode::ElementNode, lastNodeInBody.nodeType());
    759     WebString uri = GetSubResourceLinkFromElement(
    760         lastNodeInBody.to<WebElement>());
    761     EXPECT_TRUE(uri.isNull());
    762   }
    763 
    764  private:
  // Maps a frame URL to the serialized content recorded for that frame.
  typedef base::hash_map<std::string, std::string> SerializedFrameContentMap;
  SerializedFrameContentMap serialized_frame_map_;
  // Maps a frame URL to whether serialization of that frame has finished.
  typedef base::hash_map<std::string, bool> SerializationFinishStatusMap;
  SerializationFinishStatusMap serialization_finish_status_;
  // Flag indicating whether the DOM serialization process has finished.
  bool serialized_;
  // Dummy relative path of the directory that contains all saved auxiliary
  // files, including all sub frames and resources.
  const base::FilePath local_directory_name_;
    776 };
    777 
    778 // If original contents have document type, the serialized contents also have
    779 // document type.
    780 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) {
    781   base::FilePath page_file_path =
    782       GetTestFilePath("dom_serializer", "youtube_1.htm");
    783   GURL file_url = net::FilePathToFileURL(page_file_path);
    784   ASSERT_TRUE(file_url.SchemeIsFile());
    785   // Load the test file.
    786   NavigateToURL(shell(), file_url);
    787 
    788   PostTaskToInProcessRendererAndWait(
    789         base::Bind(&DomSerializerTests::SerializeHTMLDOMWithDocTypeOnRenderer,
    790                    base::Unretained(this), file_url));
    791 }
    792 
    793 // If original contents do not have document type, the serialized contents
    794 // also do not have document type.
    795 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) {
    796   base::FilePath page_file_path =
    797       GetTestFilePath("dom_serializer", "youtube_2.htm");
    798   GURL file_url = net::FilePathToFileURL(page_file_path);
    799   ASSERT_TRUE(file_url.SchemeIsFile());
    800   // Load the test file.
    801   NavigateToURL(shell(), file_url);
    802 
    803   PostTaskToInProcessRendererAndWait(
    804         base::Bind(
    805             &DomSerializerTests::SerializeHTMLDOMWithoutDocTypeOnRenderer,
    806             base::Unretained(this), file_url));
    807 }
    808 
    809 // Serialize XML document which has all 5 built-in entities. After
    810 // finishing serialization, the serialized contents should be same
    811 // with original XML document.
    812 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeXMLDocWithBuiltInEntities) {
    813   base::FilePath page_file_path =
    814       GetTestFilePath("dom_serializer", "note.html");
    815   base::FilePath xml_file_path = GetTestFilePath("dom_serializer", "note.xml");
    816   // Read original contents for later comparison.
    817   std::string original_contents;
    818   ASSERT_TRUE(base::ReadFileToString(xml_file_path, &original_contents));
    819   // Get file URL.
    820   GURL file_url = net::FilePathToFileURL(page_file_path);
    821   GURL xml_file_url = net::FilePathToFileURL(xml_file_path);
    822   ASSERT_TRUE(file_url.SchemeIsFile());
    823   // Load the test file.
    824   NavigateToURL(shell(), file_url);
    825 
    826   PostTaskToInProcessRendererAndWait(
    827         base::Bind(
    828             &DomSerializerTests::SerializeXMLDocWithBuiltInEntitiesOnRenderer,
    829             base::Unretained(this), xml_file_url, original_contents));
    830 }
    831 
    832 // When serializing DOM, we add MOTW declaration before html tag.
    833 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) {
    834   base::FilePath page_file_path =
    835       GetTestFilePath("dom_serializer", "youtube_2.htm");
    836   // Read original contents for later comparison .
    837   std::string original_contents;
    838   ASSERT_TRUE(base::ReadFileToString(page_file_path, &original_contents));
    839   // Get file URL.
    840   GURL file_url = net::FilePathToFileURL(page_file_path);
    841   ASSERT_TRUE(file_url.SchemeIsFile());
    842 
    843   // Load the test file.
    844   NavigateToURL(shell(), file_url);
    845 
    846   PostTaskToInProcessRendererAndWait(
    847         base::Bind(
    848             &DomSerializerTests::SerializeHTMLDOMWithAddingMOTWOnRenderer,
    849             base::Unretained(this), file_url, original_contents));
    850 }
    851 
    852 // When serializing DOM, we will add the META which have correct charset
    853 // declaration as first child of HEAD element for resolving WebKit bug:
    854 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
    855 // does not have META charset declaration.
    856 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    857                        SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) {
    858   base::FilePath page_file_path =
    859       GetTestFilePath("dom_serializer", "youtube_1.htm");
    860   // Get file URL.
    861   GURL file_url = net::FilePathToFileURL(page_file_path);
    862   ASSERT_TRUE(file_url.SchemeIsFile());
    863   // Load the test file.
    864   NavigateToURL(shell(), file_url);
    865 
    866   PostTaskToInProcessRendererAndWait(
    867         base::Bind(
    868             &DomSerializerTests::
    869                 SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer,
    870             base::Unretained(this), file_url));
    871 }
    872 
    873 // When serializing DOM, if the original document has multiple META charset
    874 // declaration, we will add the META which have correct charset declaration
    875 // as first child of HEAD element and remove all original META charset
    876 // declarations.
    877 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    878                        SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) {
    879   base::FilePath page_file_path =
    880       GetTestFilePath("dom_serializer", "youtube_2.htm");
    881   // Get file URL.
    882   GURL file_url = net::FilePathToFileURL(page_file_path);
    883   ASSERT_TRUE(file_url.SchemeIsFile());
    884   // Load the test file.
    885   NavigateToURL(shell(), file_url);
    886 
    887   PostTaskToInProcessRendererAndWait(
    888         base::Bind(
    889             &DomSerializerTests::
    890                 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer,
    891             base::Unretained(this), file_url));
    892 }
    893 
    894 // Test situation of html entities in text when serializing HTML DOM.
    895 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) {
    896   // Need to spin up the renderer and also navigate to a file url so that the
    897   // renderer code doesn't attempt a fork when it sees a load to file scheme
    898   // from non-file scheme.
    899   NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
    900 
    901   PostTaskToInProcessRendererAndWait(
    902         base::Bind(
    903             &DomSerializerTests::SerializeHTMLDOMWithEntitiesInTextOnRenderer,
    904             base::Unretained(this)));
    905 }
    906 
    907 // Test situation of html entities in attribute value when serializing
    908 // HTML DOM.
    909 // This test started to fail at WebKit r65388. See http://crbug.com/52279.
    910 //
    911 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge --
    912 // Some attributes are handled differently in the merged serializer.
    913 // Bug: http://crbug.com/328354
    914 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    915                        DISABLE_SerializeHTMLDOMWithEntitiesInAttributeValue) {
    916   // Need to spin up the renderer and also navigate to a file url so that the
    917   // renderer code doesn't attempt a fork when it sees a load to file scheme
    918   // from non-file scheme.
    919   NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
    920 
    921   PostTaskToInProcessRendererAndWait(
    922         base::Bind(
    923             &DomSerializerTests::
    924                 SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer,
    925             base::Unretained(this)));
    926 }
    927 
    928 // Test situation of non-standard HTML entities when serializing HTML DOM.
    929 // This test started to fail at WebKit r65351. See http://crbug.com/52279.
    930 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    931                        SerializeHTMLDOMWithNonStandardEntities) {
    932   // Make a test file URL and load it.
    933   base::FilePath page_file_path = GetTestFilePath(
    934       "dom_serializer", "nonstandard_htmlentities.htm");
    935   GURL file_url = net::FilePathToFileURL(page_file_path);
    936   NavigateToURL(shell(), file_url);
    937 
    938   PostTaskToInProcessRendererAndWait(
    939         base::Bind(
    940             &DomSerializerTests::
    941                 SerializeHTMLDOMWithNonStandardEntitiesOnRenderer,
    942             base::Unretained(this), file_url));
    943 }
    944 
    945 // Test situation of BASE tag in original document when serializing HTML DOM.
    946 // When serializing, we should comment the BASE tag, append a new BASE tag.
    947 // rewrite all the savable URLs to relative local path, and change other URLs
    948 // to absolute URLs.
    949 //
    950 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge --
    951 // Base tags are handled a bit different in merged version.
    952 // Bug: http://crbug.com/328354
    953 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    954                        DISABLE_SerializeHTMLDOMWithBaseTag) {
    955   base::FilePath page_file_path = GetTestFilePath(
    956       "dom_serializer", "html_doc_has_base_tag.htm");
    957 
    958   // Get page dir URL which is base URL of this file.
    959   base::FilePath dir_name = page_file_path.DirName();
    960   dir_name = dir_name.Append(
    961       base::FilePath::StringType(base::FilePath::kSeparators[0], 1));
    962   GURL path_dir_url = net::FilePathToFileURL(dir_name);
    963 
    964   // Get file URL.
    965   GURL file_url = net::FilePathToFileURL(page_file_path);
    966   ASSERT_TRUE(file_url.SchemeIsFile());
    967   // Load the test file.
    968   NavigateToURL(shell(), file_url);
    969 
    970   PostTaskToInProcessRendererAndWait(
    971         base::Bind(
    972             &DomSerializerTests::SerializeHTMLDOMWithBaseTagOnRenderer,
    973             base::Unretained(this), file_url, path_dir_url));
    974 }
    975 
    976 // Serializing page which has an empty HEAD tag.
    977 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) {
    978   // Need to spin up the renderer and also navigate to a file url so that the
    979   // renderer code doesn't attempt a fork when it sees a load to file scheme
    980   // from non-file scheme.
    981   NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
    982 
    983   PostTaskToInProcessRendererAndWait(
    984         base::Bind(&DomSerializerTests::SerializeHTMLDOMWithEmptyHeadOnRenderer,
    985                    base::Unretained(this)));
    986 }
    987 
    988 // Test that we don't crash when the page contains an iframe that
    989 // was handled as a download (http://crbug.com/42212).
    990 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    991                        SerializeDocumentWithDownloadedIFrame) {
    992   base::FilePath page_file_path = GetTestFilePath(
    993       "dom_serializer", "iframe-src-is-exe.htm");
    994   GURL file_url = net::FilePathToFileURL(page_file_path);
    995   ASSERT_TRUE(file_url.SchemeIsFile());
    996   // Load the test file.
    997   NavigateToURL(shell(), file_url);
    998 
    999   PostTaskToInProcessRendererAndWait(
   1000         base::Bind(
   1001             &DomSerializerTests::
   1002                 SerializeDocumentWithDownloadedIFrameOnRenderer,
   1003             base::Unretained(this), file_url));
   1004 }
   1005 
   1006 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
   1007                        SubResourceForElementsInNonHTMLNamespace) {
   1008   base::FilePath page_file_path = GetTestFilePath(
   1009       "dom_serializer", "non_html_namespace.htm");
   1010   GURL file_url = net::FilePathToFileURL(page_file_path);
   1011   NavigateToURL(shell(), file_url);
   1012 
   1013   PostTaskToInProcessRendererAndWait(
   1014         base::Bind(
   1015             &DomSerializerTests::
   1016                 SubResourceForElementsInNonHTMLNamespaceOnRenderer,
   1017             base::Unretained(this), file_url));
   1018 }
   1019 
   1020 }  // namespace content
   1021