      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      5 #include "base/bind.h"
      6 #include "base/command_line.h"
      7 #include "base/compiler_specific.h"
      8 #include "base/containers/hash_tables.h"
      9 #include "base/file_util.h"
     10 #include "base/files/file_path.h"
     11 #include "base/strings/string_util.h"
     12 #include "base/strings/utf_string_conversions.h"
     13 #include "content/public/common/content_switches.h"
     14 #include "content/public/renderer/render_view.h"
     15 #include "content/public/renderer/render_view_observer.h"
     16 #include "content/public/test/content_browser_test.h"
     17 #include "content/public/test/content_browser_test_utils.h"
     18 #include "content/public/test/test_utils.h"
     19 #include "content/renderer/savable_resources.h"
     20 #include "content/shell/browser/shell.h"
     21 #include "net/base/filename_util.h"
     22 #include "net/url_request/url_request_context.h"
     23 #include "third_party/WebKit/public/platform/WebCString.h"
     24 #include "third_party/WebKit/public/platform/WebData.h"
     25 #include "third_party/WebKit/public/platform/WebString.h"
     26 #include "third_party/WebKit/public/platform/WebURL.h"
     27 #include "third_party/WebKit/public/platform/WebVector.h"
     28 #include "third_party/WebKit/public/web/WebDocument.h"
     29 #include "third_party/WebKit/public/web/WebElement.h"
     30 #include "third_party/WebKit/public/web/WebElementCollection.h"
     31 #include "third_party/WebKit/public/web/WebLocalFrame.h"
     32 #include "third_party/WebKit/public/web/WebNode.h"
     33 #include "third_party/WebKit/public/web/WebNodeList.h"
     34 #include "third_party/WebKit/public/web/WebPageSerializer.h"
     35 #include "third_party/WebKit/public/web/WebPageSerializerClient.h"
     36 #include "third_party/WebKit/public/web/WebView.h"
     38 using blink::WebCString;
     39 using blink::WebData;
     40 using blink::WebDocument;
     41 using blink::WebElement;
     42 using blink::WebElementCollection;
     43 using blink::WebFrame;
     44 using blink::WebLocalFrame;
     45 using blink::WebNode;
     46 using blink::WebNodeList;
     47 using blink::WebPageSerializer;
     48 using blink::WebPageSerializerClient;
     49 using blink::WebString;
     50 using blink::WebURL;
     51 using blink::WebView;
     52 using blink::WebVector;
     54 namespace {
     56 // The first RenderFrame is routing ID 1, and the first RenderView is 2.
     57 const int kRenderViewRoutingId = 2;
     59 }
     61 namespace content {
     63 // Iterate recursively over sub-frames to find one with with a given url.
     64 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) {
     65   if (!web_view->mainFrame())
     66     return NULL;
     68   std::vector<WebFrame*> stack;
     69   stack.push_back(web_view->mainFrame());
     71   while (!stack.empty()) {
     72     WebFrame* current_frame = stack.back();
     73     stack.pop_back();
     74     if (GURL(current_frame->document().url()) == url)
     75       return current_frame;
     76     WebElementCollection all = current_frame->document().all();
     77     for (WebElement element = all.firstItem();
     78          !element.isNull(); element = all.nextItem()) {
     79       // Check frame tag and iframe tag
     80       if (!element.hasTagName("frame") && !element.hasTagName("iframe"))
     81         continue;
     82       WebFrame* sub_frame = WebLocalFrame::fromFrameOwnerElement(element);
     83       if (sub_frame)
     84         stack.push_back(sub_frame);
     85     }
     86   }
     87   return NULL;
     88 }
     90 // Helper function that test whether the first node in the doc is a doc type
     91 // node.
     92 bool HasDocType(const WebDocument& doc) {
     93   WebNode node = doc.firstChild();
     94   if (node.isNull())
     95     return false;
     96   return node.nodeType() == WebNode::DocumentTypeNode;
     97 }
     99   // Helper function for checking whether input node is META tag. Return true
    100 // means it is META element, otherwise return false. The parameter charset_info
    101 // return actual charset info if the META tag has charset declaration.
    102 bool IsMetaElement(const WebNode& node, std::string& charset_info) {
    103   if (!node.isElementNode())
    104     return false;
    105   const WebElement meta = node.toConst<WebElement>();
    106   if (!meta.hasTagName("meta"))
    107     return false;
    108   charset_info.erase(0, charset_info.length());
    109   // Check the META charset declaration.
    110   WebString httpEquiv = meta.getAttribute("http-equiv");
    111   if (LowerCaseEqualsASCII(httpEquiv, "content-type")) {
    112     std::string content = meta.getAttribute("content").utf8();
    113     int pos = content.find("charset", 0);
    114     if (pos > -1) {
    115       // Add a dummy charset declaration to charset_info, which indicates this
    116       // META tag has charset declaration although we do not get correct value
    117       // yet.
    118       charset_info.append("has-charset-declaration");
    119       int remaining_length = content.length() - pos - 7;
    120       if (!remaining_length)
    121         return true;
    122       int start_pos = pos + 7;
    123       // Find "=" symbol.
    124       while (remaining_length--)
    125         if (content[start_pos++] == L'=')
    126           break;
    127       // Skip beginning space.
    128       while (remaining_length) {
    129         if (content[start_pos] > 0x0020)
    130           break;
    131         ++start_pos;
    132         --remaining_length;
    133       }
    134       if (!remaining_length)
    135         return true;
    136       int end_pos = start_pos;
    137       // Now we find out the start point of charset info. Search the end point.
    138       while (remaining_length--) {
    139         if (content[end_pos] <= 0x0020 || content[end_pos] == L';')
    140           break;
    141         ++end_pos;
    142       }
    143       // Get actual charset info.
    144       charset_info = content.substr(start_pos, end_pos - start_pos);
    145       return true;
    146     }
    147   }
    148   return true;
    149 }
    151 class LoadObserver : public RenderViewObserver {
    152  public:
    153   LoadObserver(RenderView* render_view, const base::Closure& quit_closure)
    154       : RenderViewObserver(render_view),
    155         quit_closure_(quit_closure) {}
    157   virtual void DidFinishLoad(blink::WebLocalFrame* frame) OVERRIDE {
    158     if (frame == render_view()->GetWebView()->mainFrame())
    159       quit_closure_.Run();
    160   }
    162  private:
    163   base::Closure quit_closure_;
    164 };
    166 class DomSerializerTests : public ContentBrowserTest,
    167                            public WebPageSerializerClient {
    168  public:
    169   DomSerializerTests()
    170     : serialized_(false),
    171       local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) {}
    173   virtual void SetUpCommandLine(CommandLine* command_line) OVERRIDE {
    174     command_line->AppendSwitch(switches::kSingleProcess);
    175 #if defined(OS_WIN)
    176     // Don't want to try to create a GPU process.
    177     command_line->AppendSwitch(switches::kDisableGpu);
    178 #endif
    179   }
    181   // DomSerializerDelegate.
    182   virtual void didSerializeDataForFrame(const WebURL& frame_web_url,
    183                                         const WebCString& data,
    184                                         PageSerializationStatus status) {
    186     GURL frame_url(frame_web_url);
    187     // If the all frames are finished saving, check all finish status
    188     if (status == WebPageSerializerClient::AllFramesAreFinished) {
    189       SerializationFinishStatusMap::iterator it =
    190           serialization_finish_status_.begin();
    191       for (; it != serialization_finish_status_.end(); ++it)
    192         ASSERT_TRUE(it->second);
    193       serialized_ = true;
    194       return;
    195     }
    197     // Check finish status of current frame.
    198     SerializationFinishStatusMap::iterator it =
    199         serialization_finish_status_.find(frame_url.spec());
    200     // New frame, set initial status as false.
    201     if (it == serialization_finish_status_.end())
    202       serialization_finish_status_[frame_url.spec()] = false;
    204     it = serialization_finish_status_.find(frame_url.spec());
    205     ASSERT_TRUE(it != serialization_finish_status_.end());
    206     // In process frame, finish status should be false.
    207     ASSERT_FALSE(it->second);
    209     // Add data to corresponding frame's content.
    210     serialized_frame_map_[frame_url.spec()] += data.data();
    212     // Current frame is completed saving, change the finish status.
    213     if (status == WebPageSerializerClient::CurrentFrameIsFinished)
    214       it->second = true;
    215   }
    217   bool HasSerializedFrame(const GURL& frame_url) {
    218     return serialized_frame_map_.find(frame_url.spec()) !=
    219            serialized_frame_map_.end();
    220   }
    222   const std::string& GetSerializedContentForFrame(
    223       const GURL& frame_url) {
    224     return serialized_frame_map_[frame_url.spec()];
    225   }
    227   RenderView* GetRenderView() {
    228     // We could have the test on the UI thread get the WebContent's routing ID,
    229     // but we know this will be the first RV so skip that and just hardcode it.
    230     return RenderView::FromRoutingID(kRenderViewRoutingId);
    231   }
    233   WebView* GetWebView() {
    234     return GetRenderView()->GetWebView();
    235   }
    237   WebFrame* GetMainFrame() {
    238     return GetWebView()->mainFrame();
    239   }
    241   // Load web page according to input content and relative URLs within
    242   // the document.
    243   void LoadContents(const std::string& contents,
    244                     const GURL& base_url,
    245                     const WebString encoding_info) {
    246     scoped_refptr<MessageLoopRunner> runner = new MessageLoopRunner;
    247     LoadObserver observer(GetRenderView(), runner->QuitClosure());
    249     // If input encoding is empty, use UTF-8 as default encoding.
    250     if (encoding_info.isEmpty()) {
    251       GetMainFrame()->loadHTMLString(contents, base_url);
    252     } else {
    253       WebData data(contents.data(), contents.length());
    255       // Do not use WebFrame.LoadHTMLString because it assumes that input
    256       // html contents use UTF-8 encoding.
    257       // TODO(darin): This should use WebFrame::loadData.
    258       WebFrame* web_frame = GetMainFrame();
    260       ASSERT_TRUE(web_frame != NULL);
    262       web_frame->loadData(data, "text/html", encoding_info, base_url);
    263     }
    265     runner->Run();
    266   }
    268   // Serialize page DOM according to specific page URL. The parameter
    269   // recursive_serialization indicates whether we will serialize all
    270   // sub-frames.
    271   void SerializeDomForURL(const GURL& page_url,
    272                           bool recursive_serialization) {
    273     // Find corresponding WebFrame according to page_url.
    274     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), page_url);
    275     ASSERT_TRUE(web_frame != NULL);
    276     WebVector<WebURL> links;
    277     links.assign(&page_url, 1);
    278     WebString file_path =
    279         base::FilePath(FILE_PATH_LITERAL("c:\\dummy.htm")).AsUTF16Unsafe();
    280     WebVector<WebString> local_paths;
    281     local_paths.assign(&file_path, 1);
    282     // Start serializing DOM.
    283     bool result = WebPageSerializer::serialize(web_frame->toWebLocalFrame(),
    284        recursive_serialization,
    285        static_cast<WebPageSerializerClient*>(this),
    286        links,
    287        local_paths,
    288        local_directory_name_.AsUTF16Unsafe());
    289     ASSERT_TRUE(result);
    290     ASSERT_TRUE(serialized_);
    291   }
    293   void SerializeHTMLDOMWithDocTypeOnRenderer(const GURL& file_url) {
    294     // Make sure original contents have document type.
    295     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    296     ASSERT_TRUE(web_frame != NULL);
    297     WebDocument doc = web_frame->document();
    298     ASSERT_TRUE(HasDocType(doc));
    299     // Do serialization.
    300     SerializeDomForURL(file_url, false);
    301     // Load the serialized contents.
    302     ASSERT_TRUE(HasSerializedFrame(file_url));
    303     const std::string& serialized_contents =
    304         GetSerializedContentForFrame(file_url);
    305     LoadContents(serialized_contents, file_url,
    306                  web_frame->document().encoding());
    307     // Make sure serialized contents still have document type.
    308     web_frame = GetMainFrame();
    309     doc = web_frame->document();
    310     ASSERT_TRUE(HasDocType(doc));
    311   }
    313   void SerializeHTMLDOMWithoutDocTypeOnRenderer(const GURL& file_url) {
    314     // Make sure original contents do not have document type.
    315     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    316     ASSERT_TRUE(web_frame != NULL);
    317     WebDocument doc = web_frame->document();
    318     ASSERT_TRUE(!HasDocType(doc));
    319     // Do serialization.
    320     SerializeDomForURL(file_url, false);
    321     // Load the serialized contents.
    322     ASSERT_TRUE(HasSerializedFrame(file_url));
    323     const std::string& serialized_contents =
    324         GetSerializedContentForFrame(file_url);
    325     LoadContents(serialized_contents, file_url,
    326                  web_frame->document().encoding());
    327     // Make sure serialized contents do not have document type.
    328     web_frame = GetMainFrame();
    329     doc = web_frame->document();
    330     ASSERT_TRUE(!HasDocType(doc));
    331   }
    333   void SerializeXMLDocWithBuiltInEntitiesOnRenderer(
    334       const GURL& xml_file_url, const std::string& original_contents) {
    335     // Do serialization.
    336     SerializeDomForURL(xml_file_url, false);
    337     // Compare the serialized contents with original contents.
    338     ASSERT_TRUE(HasSerializedFrame(xml_file_url));
    339     const std::string& serialized_contents =
    340         GetSerializedContentForFrame(xml_file_url);
    341     ASSERT_EQ(original_contents, serialized_contents);
    342   }
    344   void SerializeHTMLDOMWithAddingMOTWOnRenderer(
    345       const GURL& file_url, const std::string& original_contents) {
    346     // Make sure original contents does not have MOTW;
    347     std::string motw_declaration =
    348        WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
    349     ASSERT_FALSE(motw_declaration.empty());
    350     // The encoding of original contents is ISO-8859-1, so we convert the MOTW
    351     // declaration to ASCII and search whether original contents has it or not.
    352     ASSERT_TRUE(std::string::npos == original_contents.find(motw_declaration));
    354     // Do serialization.
    355     SerializeDomForURL(file_url, false);
    356     // Make sure the serialized contents have MOTW ;
    357     ASSERT_TRUE(HasSerializedFrame(file_url));
    358     const std::string& serialized_contents =
    359         GetSerializedContentForFrame(file_url);
    360     ASSERT_FALSE(std::string::npos ==
    361         serialized_contents.find(motw_declaration));
    362   }
    364   void SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer(
    365       const GURL& file_url) {
    366     // Make sure there is no META charset declaration in original document.
    367     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    368     ASSERT_TRUE(web_frame != NULL);
    369     WebDocument doc = web_frame->document();
    370     ASSERT_TRUE(doc.isHTMLDocument());
    371     WebElement head_element = doc.head();
    372     ASSERT_TRUE(!head_element.isNull());
    373     // Go through all children of HEAD element.
    374     for (WebNode child = head_element.firstChild(); !child.isNull();
    375          child = child.nextSibling()) {
    376       std::string charset_info;
    377       if (IsMetaElement(child, charset_info))
    378         ASSERT_TRUE(charset_info.empty());
    379     }
    380     // Do serialization.
    381     SerializeDomForURL(file_url, false);
    383     // Load the serialized contents.
    384     ASSERT_TRUE(HasSerializedFrame(file_url));
    385     const std::string& serialized_contents =
    386         GetSerializedContentForFrame(file_url);
    387     LoadContents(serialized_contents, file_url,
    388                  web_frame->document().encoding());
    389     // Make sure the first child of HEAD element is META which has charset
    390     // declaration in serialized contents.
    391     web_frame = GetMainFrame();
    392     ASSERT_TRUE(web_frame != NULL);
    393     doc = web_frame->document();
    394     ASSERT_TRUE(doc.isHTMLDocument());
    395     head_element = doc.head();
    396     ASSERT_TRUE(!head_element.isNull());
    397     WebNode meta_node = head_element.firstChild();
    398     ASSERT_TRUE(!meta_node.isNull());
    399     // Get meta charset info.
    400     std::string charset_info2;
    401     ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
    402     ASSERT_TRUE(!charset_info2.empty());
    403     ASSERT_EQ(charset_info2,
    404               std::string(web_frame->document().encoding().utf8()));
    406     // Make sure no more additional META tags which have charset declaration.
    407     for (WebNode child = meta_node.nextSibling(); !child.isNull();
    408          child = child.nextSibling()) {
    409       std::string charset_info;
    410       if (IsMetaElement(child, charset_info))
    411         ASSERT_TRUE(charset_info.empty());
    412     }
    413   }
    415   void SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer(
    416       const GURL& file_url) {
    417     // Make sure there are multiple META charset declarations in original
    418     // document.
    419     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    420     ASSERT_TRUE(web_frame != NULL);
    421     WebDocument doc = web_frame->document();
    422     ASSERT_TRUE(doc.isHTMLDocument());
    423     WebElement head_ele = doc.head();
    424     ASSERT_TRUE(!head_ele.isNull());
    425     // Go through all children of HEAD element.
    426     int charset_declaration_count = 0;
    427     for (WebNode child = head_ele.firstChild(); !child.isNull();
    428          child = child.nextSibling()) {
    429       std::string charset_info;
    430       if (IsMetaElement(child, charset_info) && !charset_info.empty())
    431         charset_declaration_count++;
    432     }
    433     // The original doc has more than META tags which have charset declaration.
    434     ASSERT_TRUE(charset_declaration_count > 1);
    436     // Do serialization.
    437     SerializeDomForURL(file_url, false);
    439     // Load the serialized contents.
    440     ASSERT_TRUE(HasSerializedFrame(file_url));
    441     const std::string& serialized_contents =
    442         GetSerializedContentForFrame(file_url);
    443     LoadContents(serialized_contents, file_url,
    444                  web_frame->document().encoding());
    445     // Make sure only first child of HEAD element is META which has charset
    446     // declaration in serialized contents.
    447     web_frame = GetMainFrame();
    448     ASSERT_TRUE(web_frame != NULL);
    449     doc = web_frame->document();
    450     ASSERT_TRUE(doc.isHTMLDocument());
    451     head_ele = doc.head();
    452     ASSERT_TRUE(!head_ele.isNull());
    453     WebNode meta_node = head_ele.firstChild();
    454     ASSERT_TRUE(!meta_node.isNull());
    455     // Get meta charset info.
    456     std::string charset_info2;
    457     ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
    458     ASSERT_TRUE(!charset_info2.empty());
    459     ASSERT_EQ(charset_info2,
    460               std::string(web_frame->document().encoding().utf8()));
    462     // Make sure no more additional META tags which have charset declaration.
    463     for (WebNode child = meta_node.nextSibling(); !child.isNull();
    464          child = child.nextSibling()) {
    465       std::string charset_info;
    466       if (IsMetaElement(child, charset_info))
    467         ASSERT_TRUE(charset_info.empty());
    468     }
    469   }
    471   void SerializeHTMLDOMWithEntitiesInTextOnRenderer() {
    472     base::FilePath page_file_path = GetTestFilePath(
    473         "dom_serializer", "dom_serializer/htmlentities_in_text.htm");
    474     // Get file URL. The URL is dummy URL to identify the following loading
    475     // actions. The test content is in constant:original_contents.
    476     GURL file_url = net::FilePathToFileURL(page_file_path);
    477     ASSERT_TRUE(file_url.SchemeIsFile());
    478     // Test contents.
    479     static const char* const original_contents =
    480         "<html><body>&amp;&lt;&gt;\"\'</body></html>";
    481     // Load the test contents.
    482     LoadContents(original_contents, file_url, WebString());
    484     // Get BODY's text content in DOM.
    485     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    486     ASSERT_TRUE(web_frame != NULL);
    487     WebDocument doc = web_frame->document();
    488     ASSERT_TRUE(doc.isHTMLDocument());
    489     WebElement body_ele = doc.body();
    490     ASSERT_TRUE(!body_ele.isNull());
    491     WebNode text_node = body_ele.firstChild();
    492     ASSERT_TRUE(text_node.isTextNode());
    493     ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) ==
    494                 "&amp;&lt;&gt;\"\'");
    495     // Do serialization.
    496     SerializeDomForURL(file_url, false);
    497     // Compare the serialized contents with original contents.
    498     ASSERT_TRUE(HasSerializedFrame(file_url));
    499     const std::string& serialized_contents =
    500         GetSerializedContentForFrame(file_url);
    501     // Compare the serialized contents with original contents to make sure
    502     // they are same.
    503     // Because we add MOTW when serializing DOM, so before comparison, we also
    504     // need to add MOTW to original_contents.
    505     std::string original_str =
    506       WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
    507     original_str += original_contents;
    508     // Since WebCore now inserts a new HEAD element if there is no HEAD element
    509     // when creating BODY element. (Please see
    510     // HTMLParser::bodyCreateErrorCheck.) We need to append the HEAD content and
    511     // corresponding META content if we find WebCore-generated HEAD element.
    512     if (!doc.head().isNull()) {
    513       WebString encoding = web_frame->document().encoding();
    514       std::string htmlTag("<html>");
    515       std::string::size_type pos = original_str.find(htmlTag);
    516       ASSERT_NE(std::string::npos, pos);
    517       pos += htmlTag.length();
    518       std::string head_part("<head>");
    519       head_part +=
    520           WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
    521       head_part += "</head>";
    522       original_str.insert(pos, head_part);
    523     }
    524     ASSERT_EQ(original_str, serialized_contents);
    525   }
    527   void SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer() {
    528     base::FilePath page_file_path = GetTestFilePath(
    529         "dom_serializer", "dom_serializer/htmlentities_in_attribute_value.htm");
    530     // Get file URL. The URL is dummy URL to identify the following loading
    531     // actions. The test content is in constant:original_contents.
    532     GURL file_url = net::FilePathToFileURL(page_file_path);
    533     ASSERT_TRUE(file_url.SchemeIsFile());
    534     // Test contents.
    535     static const char* const original_contents =
    536         "<html><body title=\"&amp;&lt;&gt;&quot;&#39;\"></body></html>";
    537     // Load the test contents.
    538     LoadContents(original_contents, file_url, WebString());
    539     // Get value of BODY's title attribute in DOM.
    540     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    541     ASSERT_TRUE(web_frame != NULL);
    542     WebDocument doc = web_frame->document();
    543     ASSERT_TRUE(doc.isHTMLDocument());
    544     WebElement body_ele = doc.body();
    545     ASSERT_TRUE(!body_ele.isNull());
    546     WebString value = body_ele.getAttribute("title");
    547     ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'");
    548     // Do serialization.
    549     SerializeDomForURL(file_url, false);
    550     // Compare the serialized contents with original contents.
    551     ASSERT_TRUE(HasSerializedFrame(file_url));
    552     const std::string& serialized_contents =
    553         GetSerializedContentForFrame(file_url);
    554     // Compare the serialized contents with original contents to make sure
    555     // they are same.
    556     std::string original_str =
    557         WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
    558     original_str += original_contents;
    559     if (!doc.isNull()) {
    560       WebString encoding = web_frame->document().encoding();
    561       std::string htmlTag("<html>");
    562       std::string::size_type pos = original_str.find(htmlTag);
    563       ASSERT_NE(std::string::npos, pos);
    564       pos += htmlTag.length();
    565       std::string head_part("<head>");
    566       head_part +=
    567           WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
    568       head_part += "</head>";
    569       original_str.insert(pos, head_part);
    570     }
    571     ASSERT_EQ(original_str, serialized_contents);
    572   }
    574   void SerializeHTMLDOMWithNonStandardEntitiesOnRenderer(const GURL& file_url) {
    575     // Get value of BODY's title attribute in DOM.
    576     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    577     WebDocument doc = web_frame->document();
    578     ASSERT_TRUE(doc.isHTMLDocument());
    579     WebElement body_element = doc.body();
    580     // Unescaped string for "&percnt;&nsup;&sup1;&apos;".
    581     static const wchar_t parsed_value[] = {
    582       '%', 0x2285, 0x00b9, '\'', 0
    583     };
    584     WebString value = body_element.getAttribute("title");
    585     ASSERT_TRUE(base::UTF16ToWide(value) == parsed_value);
    586     ASSERT_TRUE(base::UTF16ToWide(body_element.innerText()) == parsed_value);
    588     // Do serialization.
    589     SerializeDomForURL(file_url, false);
    590     // Check the serialized string.
    591     ASSERT_TRUE(HasSerializedFrame(file_url));
    592     const std::string& serialized_contents =
    593         GetSerializedContentForFrame(file_url);
    594     // Confirm that the serialized string has no non-standard HTML entities.
    595     ASSERT_EQ(std::string::npos, serialized_contents.find("&percnt;"));
    596     ASSERT_EQ(std::string::npos, serialized_contents.find("&nsup;"));
    597     ASSERT_EQ(std::string::npos, serialized_contents.find("&sup1;"));
    598     ASSERT_EQ(std::string::npos, serialized_contents.find("&apos;"));
    599   }
    601   void SerializeHTMLDOMWithBaseTagOnRenderer(const GURL& file_url,
    602                                              const GURL& path_dir_url) {
    603     // There are total 2 available base tags in this test file.
    604     const int kTotalBaseTagCountInTestFile = 2;
    606     // Since for this test, we assume there is no savable sub-resource links for
    607     // this test file, also all links are relative URLs in this test file, so we
    608     // need to check those relative URLs and make sure document has BASE tag.
    609     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    610     ASSERT_TRUE(web_frame != NULL);
    611     WebDocument doc = web_frame->document();
    612     ASSERT_TRUE(doc.isHTMLDocument());
    613     // Go through all descent nodes.
    614     WebElementCollection all = doc.all();
    615     int original_base_tag_count = 0;
    616     for (WebElement element = all.firstItem(); !element.isNull();
    617          element = all.nextItem()) {
    618       if (element.hasTagName("base")) {
    619         original_base_tag_count++;
    620       } else {
    621         // Get link.
    622         WebString value = GetSubResourceLinkFromElement(element);
    623         if (value.isNull() && element.hasTagName("a")) {
    624           value = element.getAttribute("href");
    625           if (value.isEmpty())
    626             value = WebString();
    627         }
    628         // Each link is relative link.
    629         if (!value.isNull()) {
    630           GURL link(value.utf8());
    631           ASSERT_TRUE(link.scheme().empty());
    632         }
    633       }
    634     }
    635     ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile);
    636     // Make sure in original document, the base URL is not equal with the
    637     // |path_dir_url|.
    638     GURL original_base_url(doc.baseURL());
    639     ASSERT_NE(original_base_url, path_dir_url);
    641     // Do serialization.
    642     SerializeDomForURL(file_url, false);
    644     // Load the serialized contents.
    645     ASSERT_TRUE(HasSerializedFrame(file_url));
    646     const std::string& serialized_contents =
    647         GetSerializedContentForFrame(file_url);
    648     LoadContents(serialized_contents, file_url,
    649                  web_frame->document().encoding());
    651     // Make sure all links are absolute URLs and doc there are some number of
    652     // BASE tags in serialized HTML data. Each of those BASE tags have same base
    653     // URL which is as same as URL of current test file.
    654     web_frame = GetMainFrame();
    655     ASSERT_TRUE(web_frame != NULL);
    656     doc = web_frame->document();
    657     ASSERT_TRUE(doc.isHTMLDocument());
    658     // Go through all descent nodes.
    659     all = doc.all();
    660     int new_base_tag_count = 0;
    661     for (WebNode node = all.firstItem(); !node.isNull();
    662          node = all.nextItem()) {
    663       if (!node.isElementNode())
    664         continue;
    665       WebElement element = node.to<WebElement>();
    666       if (element.hasTagName("base")) {
    667         new_base_tag_count++;
    668       } else {
    669         // Get link.
    670         WebString value = GetSubResourceLinkFromElement(element);
    671         if (value.isNull() && element.hasTagName("a")) {
    672           value = element.getAttribute("href");
    673           if (value.isEmpty())
    674             value = WebString();
    675         }
    676         // Each link is absolute link.
    677         if (!value.isNull()) {
    678           GURL link(std::string(value.utf8()));
    679           ASSERT_FALSE(link.scheme().empty());
    680         }
    681       }
    682     }
    683     // We have one more added BASE tag which is generated by JavaScript.
    684     ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1);
    685     // Make sure in new document, the base URL is equal with the |path_dir_url|.
    686     GURL new_base_url(doc.baseURL());
    687     ASSERT_EQ(new_base_url, path_dir_url);
    688   }
    690   void SerializeHTMLDOMWithEmptyHeadOnRenderer() {
    691     base::FilePath page_file_path = GetTestFilePath(
    692         "dom_serializer", "empty_head.htm");
    693     GURL file_url = net::FilePathToFileURL(page_file_path);
    694     ASSERT_TRUE(file_url.SchemeIsFile());
    696     // Load the test html content.
    697     static const char* const empty_head_contents =
    698       "<html><head></head><body>hello world</body></html>";
    699     LoadContents(empty_head_contents, file_url, WebString());
    701     // Make sure the head tag is empty.
    702     WebFrame* web_frame = GetMainFrame();
    703     ASSERT_TRUE(web_frame != NULL);
    704     WebDocument doc = web_frame->document();
    705     ASSERT_TRUE(doc.isHTMLDocument());
    706     WebElement head_element = doc.head();
    707     ASSERT_TRUE(!head_element.isNull());
    708     ASSERT_TRUE(!head_element.hasChildNodes());
    709     ASSERT_TRUE(head_element.childNodes().length() == 0);
    711     // Do serialization.
    712     SerializeDomForURL(file_url, false);
    713     // Make sure the serialized contents have META ;
    714     ASSERT_TRUE(HasSerializedFrame(file_url));
    715     const std::string& serialized_contents =
    716         GetSerializedContentForFrame(file_url);
    718     // Reload serialized contents and make sure there is only one META tag.
    719     LoadContents(serialized_contents, file_url,
    720                  web_frame->document().encoding());
    721     web_frame = GetMainFrame();
    722     ASSERT_TRUE(web_frame != NULL);
    723     doc = web_frame->document();
    724     ASSERT_TRUE(doc.isHTMLDocument());
    725     head_element = doc.head();
    726     ASSERT_TRUE(!head_element.isNull());
    727     ASSERT_TRUE(head_element.hasChildNodes());
    728     ASSERT_TRUE(head_element.childNodes().length() == 1);
    729     WebNode meta_node = head_element.firstChild();
    730     ASSERT_TRUE(!meta_node.isNull());
    731     // Get meta charset info.
    732     std::string charset_info;
    733     ASSERT_TRUE(IsMetaElement(meta_node, charset_info));
    734     ASSERT_TRUE(!charset_info.empty());
    735     ASSERT_EQ(charset_info,
    736               std::string(web_frame->document().encoding().utf8()));
    738     // Check the body's first node is text node and its contents are
    739     // "hello world"
    740     WebElement body_element = doc.body();
    741     ASSERT_TRUE(!body_element.isNull());
    742     WebNode text_node = body_element.firstChild();
    743     ASSERT_TRUE(text_node.isTextNode());
    744     WebString text_node_contents = text_node.nodeValue();
    745     ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world");
    746   }
    748   void SerializeDocumentWithDownloadedIFrameOnRenderer(const GURL& file_url) {
    749     // Do a recursive serialization. We pass if we don't crash.
    750     SerializeDomForURL(file_url, true);
    751   }
    753   void SubResourceForElementsInNonHTMLNamespaceOnRenderer(
    754       const GURL& file_url) {
    755     WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
    756     ASSERT_TRUE(web_frame != NULL);
    757     WebDocument doc = web_frame->document();
    758     WebNode lastNodeInBody = doc.body().lastChild();
    759     ASSERT_EQ(WebNode::ElementNode, lastNodeInBody.nodeType());
    760     WebString uri = GetSubResourceLinkFromElement(
    761         lastNodeInBody.to<WebElement>());
    762     EXPECT_TRUE(uri.isNull());
    763   }
    765  private:
    // Maps a frame URL to the serialized content of that frame.
    typedef base::hash_map<std::string, std::string> SerializedFrameContentMap;
    SerializedFrameContentMap serialized_frame_map_;
    // Maps a frame URL to the serialization-finished status of that frame.
    typedef base::hash_map<std::string, bool> SerializationFinishStatusMap;
    SerializationFinishStatusMap serialization_finish_status_;
    // Indicates whether the process of serializing the DOM has finished.
    bool serialized_;
    // |local_directory_name_| is a dummy relative path to the directory that
    // contains all saved auxiliary files, including all sub frames and
    // resources.
    const base::FilePath local_directory_name_;
    777 };
    779 // If original contents have document type, the serialized contents also have
    780 // document type.
    781 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) {
    782   base::FilePath page_file_path =
    783       GetTestFilePath("dom_serializer", "youtube_1.htm");
    784   GURL file_url = net::FilePathToFileURL(page_file_path);
    785   ASSERT_TRUE(file_url.SchemeIsFile());
    786   // Load the test file.
    787   NavigateToURL(shell(), file_url);
    789   PostTaskToInProcessRendererAndWait(
    790         base::Bind(&DomSerializerTests::SerializeHTMLDOMWithDocTypeOnRenderer,
    791                    base::Unretained(this), file_url));
    792 }
    794 // If original contents do not have document type, the serialized contents
    795 // also do not have document type.
    796 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) {
    797   base::FilePath page_file_path =
    798       GetTestFilePath("dom_serializer", "youtube_2.htm");
    799   GURL file_url = net::FilePathToFileURL(page_file_path);
    800   ASSERT_TRUE(file_url.SchemeIsFile());
    801   // Load the test file.
    802   NavigateToURL(shell(), file_url);
    804   PostTaskToInProcessRendererAndWait(
    805         base::Bind(
    806             &DomSerializerTests::SerializeHTMLDOMWithoutDocTypeOnRenderer,
    807             base::Unretained(this), file_url));
    808 }
    810 // Serialize XML document which has all 5 built-in entities. After
    811 // finishing serialization, the serialized contents should be same
    812 // with original XML document.
    813 //
    814 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge --
    815 // XML headers are handled differently in the merged serializer.
    816 // Bug: http://crbug.com/328354
    817 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    818                        DISABLED_SerializeXMLDocWithBuiltInEntities) {
    819   base::FilePath page_file_path =
    820       GetTestFilePath("dom_serializer", "note.html");
    821   base::FilePath xml_file_path = GetTestFilePath("dom_serializer", "note.xml");
    822   // Read original contents for later comparison.
    823   std::string original_contents;
    824   ASSERT_TRUE(base::ReadFileToString(xml_file_path, &original_contents));
    825   // Get file URL.
    826   GURL file_url = net::FilePathToFileURL(page_file_path);
    827   GURL xml_file_url = net::FilePathToFileURL(xml_file_path);
    828   ASSERT_TRUE(file_url.SchemeIsFile());
    829   // Load the test file.
    830   NavigateToURL(shell(), file_url);
    832   PostTaskToInProcessRendererAndWait(
    833         base::Bind(
    834             &DomSerializerTests::SerializeXMLDocWithBuiltInEntitiesOnRenderer,
    835             base::Unretained(this), xml_file_url, original_contents));
    836 }
    838 // When serializing DOM, we add MOTW declaration before html tag.
    839 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) {
    840   base::FilePath page_file_path =
    841       GetTestFilePath("dom_serializer", "youtube_2.htm");
    842   // Read original contents for later comparison .
    843   std::string original_contents;
    844   ASSERT_TRUE(base::ReadFileToString(page_file_path, &original_contents));
    845   // Get file URL.
    846   GURL file_url = net::FilePathToFileURL(page_file_path);
    847   ASSERT_TRUE(file_url.SchemeIsFile());
    849   // Load the test file.
    850   NavigateToURL(shell(), file_url);
    852   PostTaskToInProcessRendererAndWait(
    853         base::Bind(
    854             &DomSerializerTests::SerializeHTMLDOMWithAddingMOTWOnRenderer,
    855             base::Unretained(this), file_url, original_contents));
    856 }
    858 // When serializing DOM, we will add the META which have correct charset
    859 // declaration as first child of HEAD element for resolving WebKit bug:
    860 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
    861 // does not have META charset declaration.
    862 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    863                        SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) {
    864   base::FilePath page_file_path =
    865       GetTestFilePath("dom_serializer", "youtube_1.htm");
    866   // Get file URL.
    867   GURL file_url = net::FilePathToFileURL(page_file_path);
    868   ASSERT_TRUE(file_url.SchemeIsFile());
    869   // Load the test file.
    870   NavigateToURL(shell(), file_url);
    872   PostTaskToInProcessRendererAndWait(
    873         base::Bind(
    874             &DomSerializerTests::
    875                 SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer,
    876             base::Unretained(this), file_url));
    877 }
    879 // When serializing DOM, if the original document has multiple META charset
    880 // declaration, we will add the META which have correct charset declaration
    881 // as first child of HEAD element and remove all original META charset
    882 // declarations.
    883 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    884                        SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) {
    885   base::FilePath page_file_path =
    886       GetTestFilePath("dom_serializer", "youtube_2.htm");
    887   // Get file URL.
    888   GURL file_url = net::FilePathToFileURL(page_file_path);
    889   ASSERT_TRUE(file_url.SchemeIsFile());
    890   // Load the test file.
    891   NavigateToURL(shell(), file_url);
    893   PostTaskToInProcessRendererAndWait(
    894         base::Bind(
    895             &DomSerializerTests::
    896                 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer,
    897             base::Unretained(this), file_url));
    898 }
    900 // Test situation of html entities in text when serializing HTML DOM.
    901 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) {
    902   // Need to spin up the renderer and also navigate to a file url so that the
    903   // renderer code doesn't attempt a fork when it sees a load to file scheme
    904   // from non-file scheme.
    905   NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
    907   PostTaskToInProcessRendererAndWait(
    908         base::Bind(
    909             &DomSerializerTests::SerializeHTMLDOMWithEntitiesInTextOnRenderer,
    910             base::Unretained(this)));
    911 }
    913 // Test situation of html entities in attribute value when serializing
    914 // HTML DOM.
    915 // This test started to fail at WebKit r65388. See http://crbug.com/52279.
    916 //
    917 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge --
    918 // Some attributes are handled differently in the merged serializer.
    919 // Bug: http://crbug.com/328354
    920 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    921                        DISABLED_SerializeHTMLDOMWithEntitiesInAttributeValue) {
    922   // Need to spin up the renderer and also navigate to a file url so that the
    923   // renderer code doesn't attempt a fork when it sees a load to file scheme
    924   // from non-file scheme.
    925   NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
    927   PostTaskToInProcessRendererAndWait(
    928         base::Bind(
    929             &DomSerializerTests::
    930                 SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer,
    931             base::Unretained(this)));
    932 }
    934 // Test situation of non-standard HTML entities when serializing HTML DOM.
    935 // This test started to fail at WebKit r65351. See http://crbug.com/52279.
    936 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    937                        SerializeHTMLDOMWithNonStandardEntities) {
    938   // Make a test file URL and load it.
    939   base::FilePath page_file_path = GetTestFilePath(
    940       "dom_serializer", "nonstandard_htmlentities.htm");
    941   GURL file_url = net::FilePathToFileURL(page_file_path);
    942   NavigateToURL(shell(), file_url);
    944   PostTaskToInProcessRendererAndWait(
    945         base::Bind(
    946             &DomSerializerTests::
    947                 SerializeHTMLDOMWithNonStandardEntitiesOnRenderer,
    948             base::Unretained(this), file_url));
    949 }
    951 // Test situation of BASE tag in original document when serializing HTML DOM.
    952 // When serializing, we should comment the BASE tag, append a new BASE tag.
    953 // rewrite all the savable URLs to relative local path, and change other URLs
    954 // to absolute URLs.
    955 //
    956 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge --
    957 // Base tags are handled a bit different in merged version.
    958 // Bug: http://crbug.com/328354
    959 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    960                        DISABLED_SerializeHTMLDOMWithBaseTag) {
    961   base::FilePath page_file_path = GetTestFilePath(
    962       "dom_serializer", "html_doc_has_base_tag.htm");
    964   // Get page dir URL which is base URL of this file.
    965   base::FilePath dir_name = page_file_path.DirName();
    966   dir_name = dir_name.Append(
    967       base::FilePath::StringType(base::FilePath::kSeparators[0], 1));
    968   GURL path_dir_url = net::FilePathToFileURL(dir_name);
    970   // Get file URL.
    971   GURL file_url = net::FilePathToFileURL(page_file_path);
    972   ASSERT_TRUE(file_url.SchemeIsFile());
    973   // Load the test file.
    974   NavigateToURL(shell(), file_url);
    976   PostTaskToInProcessRendererAndWait(
    977         base::Bind(
    978             &DomSerializerTests::SerializeHTMLDOMWithBaseTagOnRenderer,
    979             base::Unretained(this), file_url, path_dir_url));
    980 }
    982 // Serializing page which has an empty HEAD tag.
    983 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) {
    984   // Need to spin up the renderer and also navigate to a file url so that the
    985   // renderer code doesn't attempt a fork when it sees a load to file scheme
    986   // from non-file scheme.
    987   NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
    989   PostTaskToInProcessRendererAndWait(
    990         base::Bind(&DomSerializerTests::SerializeHTMLDOMWithEmptyHeadOnRenderer,
    991                    base::Unretained(this)));
    992 }
    994 // Test that we don't crash when the page contains an iframe that
    995 // was handled as a download (http://crbug.com/42212).
    996 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
    997                        SerializeDocumentWithDownloadedIFrame) {
    998   base::FilePath page_file_path = GetTestFilePath(
    999       "dom_serializer", "iframe-src-is-exe.htm");
   1000   GURL file_url = net::FilePathToFileURL(page_file_path);
   1001   ASSERT_TRUE(file_url.SchemeIsFile());
   1002   // Load the test file.
   1003   NavigateToURL(shell(), file_url);
   1005   PostTaskToInProcessRendererAndWait(
   1006         base::Bind(
   1007             &DomSerializerTests::
   1008                 SerializeDocumentWithDownloadedIFrameOnRenderer,
   1009             base::Unretained(this), file_url));
   1010 }
   1012 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
   1013                        SubResourceForElementsInNonHTMLNamespace) {
   1014   base::FilePath page_file_path = GetTestFilePath(
   1015       "dom_serializer", "non_html_namespace.htm");
   1016   GURL file_url = net::FilePathToFileURL(page_file_path);
   1017   NavigateToURL(shell(), file_url);
   1019   PostTaskToInProcessRendererAndWait(
   1020         base::Bind(
   1021             &DomSerializerTests::
   1022                 SubResourceForElementsInNonHTMLNamespaceOnRenderer,
   1023             base::Unretained(this), file_url));
   1024 }
   1026 }  // namespace content