Home | History | Annotate | Download | only in glue
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/compiler_specific.h"
      6 #include "base/file_path.h"
      7 #include "base/file_util.h"
      8 #include "base/hash_tables.h"
      9 #include "base/string_util.h"
     10 #include "base/utf_string_conversions.h"
     11 #include "net/base/net_util.h"
     12 #include "net/url_request/url_request_context.h"
     13 #include "third_party/WebKit/Source/WebKit/chromium/public/WebCString.h"
     14 #include "third_party/WebKit/Source/WebKit/chromium/public/WebData.h"
     15 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h"
     16 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h"
     17 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h"
     18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNode.h"
     19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h"
     20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeList.h"
     21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebPageSerializer.h"
     22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebPageSerializerClient.h"
     23 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h"
     24 #include "third_party/WebKit/Source/WebKit/chromium/public/WebURL.h"
     25 #include "third_party/WebKit/Source/WebKit/chromium/public/WebVector.h"
     26 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h"
     27 #include "webkit/glue/dom_operations.h"
     28 #include "webkit/glue/webkit_glue.h"
     29 #include "webkit/tools/test_shell/simple_resource_loader_bridge.h"
     30 #include "webkit/tools/test_shell/test_shell_test.h"
     31 
     32 using WebKit::WebCString;
     33 using WebKit::WebData;
     34 using WebKit::WebDocument;
     35 using WebKit::WebElement;
     36 using WebKit::WebFrame;
     37 using WebKit::WebNode;
     38 using WebKit::WebNodeCollection;
     39 using WebKit::WebNodeList;
     40 using WebKit::WebPageSerializer;
     41 using WebKit::WebPageSerializerClient;
     42 using WebKit::WebNode;
     43 using WebKit::WebString;
     44 using WebKit::WebURL;
     45 using WebKit::WebView;
     46 using WebKit::WebVector;
     47 
     48 namespace {
     49 
     50 // Iterate recursively over sub-frames to find one with with a given url.
     51 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) {
     52   if (!web_view->mainFrame())
     53     return NULL;
     54 
     55   std::vector<WebFrame*> stack;
     56   stack.push_back(web_view->mainFrame());
     57 
     58   while (!stack.empty()) {
     59     WebFrame* current_frame = stack.back();
     60     stack.pop_back();
     61     if (GURL(current_frame->url()) == url)
     62       return current_frame;
     63     WebNodeCollection all = current_frame->document().all();
     64     for (WebNode node = all.firstItem();
     65          !node.isNull(); node = all.nextItem()) {
     66       if (!node.isElementNode())
     67         continue;
     68       // Check frame tag and iframe tag
     69       WebElement element = node.to<WebElement>();
     70       if (!element.hasTagName("frame") && !element.hasTagName("iframe"))
     71         continue;
     72       WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element);
     73       if (sub_frame)
     74         stack.push_back(sub_frame);
     75     }
     76   }
     77   return NULL;
     78 }
     79 
     80 class DomSerializerTests : public TestShellTest,
     81                            public WebPageSerializerClient {
     82  public:
     83   DomSerializerTests()
     84     : local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) { }
     85 
     86   // DomSerializerDelegate.
     87   void didSerializeDataForFrame(const WebURL& frame_web_url,
     88                                 const WebCString& data,
     89                                 PageSerializationStatus status) {
     90 
     91     GURL frame_url(frame_web_url);
     92     // If the all frames are finished saving, check all finish status
     93     if (status == WebPageSerializerClient::AllFramesAreFinished) {
     94       SerializationFinishStatusMap::iterator it =
     95           serialization_finish_status_.begin();
     96       for (; it != serialization_finish_status_.end(); ++it)
     97         ASSERT_TRUE(it->second);
     98       serialized_ = true;
     99       return;
    100     }
    101 
    102     // Check finish status of current frame.
    103     SerializationFinishStatusMap::iterator it =
    104         serialization_finish_status_.find(frame_url.spec());
    105     // New frame, set initial status as false.
    106     if (it == serialization_finish_status_.end())
    107       serialization_finish_status_[frame_url.spec()] = false;
    108 
    109     it = serialization_finish_status_.find(frame_url.spec());
    110     ASSERT_TRUE(it != serialization_finish_status_.end());
    111     // In process frame, finish status should be false.
    112     ASSERT_FALSE(it->second);
    113 
    114     // Add data to corresponding frame's content.
    115     serialized_frame_map_[frame_url.spec()] += data.data();
    116 
    117     // Current frame is completed saving, change the finish status.
    118     if (status == WebPageSerializerClient::CurrentFrameIsFinished)
    119       it->second = true;
    120   }
    121 
    122   bool HasSerializedFrame(const GURL& frame_url) {
    123     return serialized_frame_map_.find(frame_url.spec()) !=
    124            serialized_frame_map_.end();
    125   }
    126 
    127   const std::string& GetSerializedContentForFrame(
    128       const GURL& frame_url) {
    129     return serialized_frame_map_[frame_url.spec()];
    130   }
    131 
    132   // Load web page according to specific URL.
    133   void LoadPageFromURL(const GURL& page_url) {
    134     // Load the test file.
    135     test_shell_->ResetTestController();
    136     test_shell_->LoadURL(page_url);
    137     test_shell_->WaitTestFinished();
    138   }
    139 
    140   // Load web page according to input content and relative URLs within
    141   // the document.
    142   void LoadContents(const std::string& contents,
    143                     const GURL& base_url,
    144                     const WebString encoding_info) {
    145     test_shell_->ResetTestController();
    146     // If input encoding is empty, use UTF-8 as default encoding.
    147     if (encoding_info.isEmpty()) {
    148       test_shell_->webView()->mainFrame()->loadHTMLString(contents, base_url);
    149     } else {
    150       WebData data(contents.data(), contents.length());
    151 
    152       // Do not use WebFrame.LoadHTMLString because it assumes that input
    153       // html contents use UTF-8 encoding.
    154       // TODO(darin): This should use WebFrame::loadData.
    155       WebFrame* web_frame =
    156           test_shell_->webView()->mainFrame();
    157 
    158       ASSERT_TRUE(web_frame != NULL);
    159 
    160       web_frame->loadData(data, "text/html", encoding_info, base_url);
    161     }
    162 
    163     test_shell_->WaitTestFinished();
    164   }
    165 
    166   // Serialize page DOM according to specific page URL. The parameter
    167   // recursive_serialization indicates whether we will serialize all
    168   // sub-frames.
    169   void SerializeDomForURL(const GURL& page_url,
    170                           bool recursive_serialization) {
    171     // Find corresponding WebFrame according to page_url.
    172     WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(),
    173                                             page_url);
    174     ASSERT_TRUE(web_frame != NULL);
    175     // Add input file URl to links_.
    176     links_.assign(&page_url,1);
    177     // Add dummy file path to local_path_.
    178     WebString file_path = webkit_glue::FilePathStringToWebString(
    179         FILE_PATH_LITERAL("c:\\dummy.htm"));
    180     local_paths_.assign(&file_path, 1);
    181     // Start serializing DOM.
    182     bool result = WebPageSerializer::serialize(web_frame,
    183        recursive_serialization,
    184        static_cast<WebPageSerializerClient*>(this),
    185        links_,
    186        local_paths_,
    187        webkit_glue::FilePathToWebString(local_directory_name_));
    188     ASSERT_TRUE(result);
    189     ASSERT_TRUE(serialized_);
    190   }
    191 
    192  private:
    193   // Map frame_url to corresponding serialized_content.
    194   typedef base::hash_map<std::string, std::string> SerializedFrameContentMap;
    195   SerializedFrameContentMap serialized_frame_map_;
    196   // Map frame_url to corresponding status of serialization finish.
    197   typedef base::hash_map<std::string, bool> SerializationFinishStatusMap;
    198   SerializationFinishStatusMap serialization_finish_status_;
    199   // Flag indicates whether the process of serializing DOM is finished or not.
    200   bool serialized_;
    201   // The links_ contain dummy original URLs of all saved links.
    202   WebVector<WebURL> links_;
    203   // The local_paths_ contain dummy corresponding local file paths of all saved
    204   // links, which matched links_ one by one.
    205   WebVector<WebString> local_paths_;
    206   // The local_directory_name_ is dummy relative path of directory which
    207   // contain all saved auxiliary files included all sub frames and resources.
    208   const FilePath local_directory_name_;
    209 
    210  protected:
    211   // testing::Test
    212   virtual void SetUp() {
    213     TestShellTest::SetUp();
    214     serialized_ = false;
    215   }
    216 
    217   virtual void TearDown() {
    218     TestShellTest::TearDown();
    219   }
    220 };
    221 
    222 // Helper function that test whether the first node in the doc is a doc type
    223 // node.
    224 bool HasDocType(const WebDocument& doc) {
    225   WebNode node = doc.firstChild();
    226   if (node.isNull())
    227     return false;
    228   return node.nodeType() == WebNode::DocumentTypeNode;
    229 }
    230 
    231 // Helper function for checking whether input node is META tag. Return true
    232 // means it is META element, otherwise return false. The parameter charset_info
    233 // return actual charset info if the META tag has charset declaration.
    234 bool IsMetaElement(const WebNode& node, std::string& charset_info) {
    235   if (!node.isElementNode())
    236     return false;
    237   const WebElement meta = node.toConst<WebElement>();
    238   if (!meta.hasTagName("meta"))
    239     return false;
    240   charset_info.erase(0, charset_info.length());
    241   // Check the META charset declaration.
    242   WebString httpEquiv = meta.getAttribute("http-equiv");
    243   if (LowerCaseEqualsASCII(httpEquiv, "content-type")) {
    244     std::string content = meta.getAttribute("content").utf8();
    245     int pos = content.find("charset", 0);
    246     if (pos > -1) {
    247       // Add a dummy charset declaration to charset_info, which indicates this
    248       // META tag has charset declaration although we do not get correct value
    249       // yet.
    250       charset_info.append("has-charset-declaration");
    251       int remaining_length = content.length() - pos - 7;
    252       if (!remaining_length)
    253         return true;
    254       int start_pos = pos + 7;
    255       // Find "=" symbol.
    256       while (remaining_length--)
    257         if (content[start_pos++] == L'=')
    258           break;
    259       // Skip beginning space.
    260       while (remaining_length) {
    261         if (content[start_pos] > 0x0020)
    262           break;
    263         ++start_pos;
    264         --remaining_length;
    265       }
    266       if (!remaining_length)
    267         return true;
    268       int end_pos = start_pos;
    269       // Now we find out the start point of charset info. Search the end point.
    270       while (remaining_length--) {
    271         if (content[end_pos] <= 0x0020 || content[end_pos] == L';')
    272           break;
    273         ++end_pos;
    274       }
    275       // Get actual charset info.
    276       charset_info = content.substr(start_pos, end_pos - start_pos);
    277       return true;
    278     }
    279   }
    280   return true;
    281 }
    282 
    283 // If original contents have document type, the serialized contents also have
    284 // document type.
    285 TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) {
    286   FilePath page_file_path = data_dir_;
    287   page_file_path = page_file_path.AppendASCII("dom_serializer");
    288   page_file_path = page_file_path.AppendASCII("youtube_1.htm");
    289   GURL file_url = net::FilePathToFileURL(page_file_path);
    290   ASSERT_TRUE(file_url.SchemeIsFile());
    291   // Load the test file.
    292   LoadPageFromURL(file_url);
    293   // Make sure original contents have document type.
    294   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
    295   ASSERT_TRUE(web_frame != NULL);
    296   WebDocument doc = web_frame->document();
    297   ASSERT_TRUE(HasDocType(doc));
    298   // Do serialization.
    299   SerializeDomForURL(file_url, false);
    300   // Load the serialized contents.
    301   ASSERT_TRUE(HasSerializedFrame(file_url));
    302   const std::string& serialized_contents =
    303       GetSerializedContentForFrame(file_url);
    304   LoadContents(serialized_contents, file_url,
    305                web_frame->encoding());
    306   // Make sure serialized contents still have document type.
    307   web_frame = test_shell_->webView()->mainFrame();
    308   doc = web_frame->document();
    309   ASSERT_TRUE(HasDocType(doc));
    310 }
    311 
    312 // If original contents do not have document type, the serialized contents
    313 // also do not have document type.
    314 TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) {
    315   FilePath page_file_path = data_dir_;
    316   page_file_path = page_file_path.AppendASCII("dom_serializer");
    317   page_file_path = page_file_path.AppendASCII("youtube_2.htm");
    318   GURL file_url = net::FilePathToFileURL(page_file_path);
    319   ASSERT_TRUE(file_url.SchemeIsFile());
    320   // Load the test file.
    321   LoadPageFromURL(file_url);
    322   // Make sure original contents do not have document type.
    323   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
    324   ASSERT_TRUE(web_frame != NULL);
    325   WebDocument doc = web_frame->document();
    326   ASSERT_TRUE(!HasDocType(doc));
    327   // Do serialization.
    328   SerializeDomForURL(file_url, false);
    329   // Load the serialized contents.
    330   ASSERT_TRUE(HasSerializedFrame(file_url));
    331   const std::string& serialized_contents =
    332       GetSerializedContentForFrame(file_url);
    333   LoadContents(serialized_contents, file_url,
    334                web_frame->encoding());
    335   // Make sure serialized contents do not have document type.
    336   web_frame = test_shell_->webView()->mainFrame();
    337   doc = web_frame->document();
    338   ASSERT_TRUE(!HasDocType(doc));
    339 }
    340 
    341 // Serialize XML document which has all 5 built-in entities. After
    342 // finishing serialization, the serialized contents should be same
    343 // with original XML document.
    344 TEST_F(DomSerializerTests, SerializeXMLDocWithBuiltInEntities) {
    345   FilePath page_file_path = data_dir_;
    346   page_file_path = page_file_path.AppendASCII("dom_serializer");
    347   page_file_path = page_file_path.AppendASCII("note.xml");
    348   // Read original contents for later comparison.
    349   std::string original_contents;
    350   ASSERT_TRUE(file_util::ReadFileToString(page_file_path, &original_contents));
    351   // Get file URL.
    352   GURL file_url = net::FilePathToFileURL(page_file_path);
    353   ASSERT_TRUE(file_url.SchemeIsFile());
    354   // Load the test file.
    355   LoadPageFromURL(file_url);
    356   // Do serialization.
    357   SerializeDomForURL(file_url, false);
    358   // Compare the serialized contents with original contents.
    359   ASSERT_TRUE(HasSerializedFrame(file_url));
    360   const std::string& serialized_contents =
    361       GetSerializedContentForFrame(file_url);
    362   ASSERT_EQ(original_contents, serialized_contents);
    363 }
    364 
    365 // When serializing DOM, we add MOTW declaration before html tag.
    366 TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) {
    367   FilePath page_file_path = data_dir_;
    368   page_file_path = page_file_path.AppendASCII("dom_serializer");
    369   page_file_path = page_file_path.AppendASCII("youtube_2.htm");
    370   // Read original contents for later comparison .
    371   std::string original_contents;
    372   ASSERT_TRUE(file_util::ReadFileToString(page_file_path, &original_contents));
    373   // Get file URL.
    374   GURL file_url = net::FilePathToFileURL(page_file_path);
    375   ASSERT_TRUE(file_url.SchemeIsFile());
    376   // Make sure original contents does not have MOTW;
    377   std::string motw_declaration =
    378      WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
    379   ASSERT_FALSE(motw_declaration.empty());
    380   // The encoding of original contents is ISO-8859-1, so we convert the MOTW
    381   // declaration to ASCII and search whether original contents has it or not.
    382   ASSERT_TRUE(std::string::npos ==
    383       original_contents.find(motw_declaration));
    384   // Load the test file.
    385   LoadPageFromURL(file_url);
    386   // Do serialization.
    387   SerializeDomForURL(file_url, false);
    388   // Make sure the serialized contents have MOTW ;
    389   ASSERT_TRUE(HasSerializedFrame(file_url));
    390   const std::string& serialized_contents =
    391       GetSerializedContentForFrame(file_url);
    392   ASSERT_FALSE(std::string::npos ==
    393       serialized_contents.find(motw_declaration));
    394 }
    395 
    396 // When serializing DOM, we will add the META which have correct charset
    397 // declaration as first child of HEAD element for resolving WebKit bug:
    398 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
    399 // does not have META charset declaration.
    400 TEST_F(DomSerializerTests, SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) {
    401   FilePath page_file_path = data_dir_;
    402   page_file_path = page_file_path.AppendASCII("dom_serializer");
    403   page_file_path = page_file_path.AppendASCII("youtube_1.htm");
    404   // Get file URL.
    405   GURL file_url = net::FilePathToFileURL(page_file_path);
    406   ASSERT_TRUE(file_url.SchemeIsFile());
    407   // Load the test file.
    408   LoadPageFromURL(file_url);
    409 
    410   // Make sure there is no META charset declaration in original document.
    411   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
    412   ASSERT_TRUE(web_frame != NULL);
    413   WebDocument doc = web_frame->document();
    414   ASSERT_TRUE(doc.isHTMLDocument());
    415   WebElement head_element = doc.head();
    416   ASSERT_TRUE(!head_element.isNull());
    417   // Go through all children of HEAD element.
    418   for (WebNode child = head_element.firstChild(); !child.isNull();
    419        child = child.nextSibling()) {
    420     std::string charset_info;
    421     if (IsMetaElement(child, charset_info))
    422       ASSERT_TRUE(charset_info.empty());
    423   }
    424   // Do serialization.
    425   SerializeDomForURL(file_url, false);
    426 
    427   // Load the serialized contents.
    428   ASSERT_TRUE(HasSerializedFrame(file_url));
    429   const std::string& serialized_contents =
    430       GetSerializedContentForFrame(file_url);
    431   LoadContents(serialized_contents, file_url,
    432                web_frame->encoding());
    433   // Make sure the first child of HEAD element is META which has charset
    434   // declaration in serialized contents.
    435   web_frame = test_shell_->webView()->mainFrame();
    436   ASSERT_TRUE(web_frame != NULL);
    437   doc = web_frame->document();
    438   ASSERT_TRUE(doc.isHTMLDocument());
    439   head_element = doc.head();
    440   ASSERT_TRUE(!head_element.isNull());
    441   WebNode meta_node = head_element.firstChild();
    442   ASSERT_TRUE(!meta_node.isNull());
    443   // Get meta charset info.
    444   std::string charset_info2;
    445   ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
    446   ASSERT_TRUE(!charset_info2.empty());
    447   ASSERT_TRUE(charset_info2 == std::string(web_frame->encoding().utf8()));
    448 
    449   // Make sure no more additional META tags which have charset declaration.
    450   for (WebNode child = meta_node.nextSibling(); !child.isNull();
    451        child = child.nextSibling()) {
    452     std::string charset_info;
    453     if (IsMetaElement(child, charset_info))
    454       ASSERT_TRUE(charset_info.empty());
    455   }
    456 }
    457 
    458 // When serializing DOM, if the original document has multiple META charset
    459 // declaration, we will add the META which have correct charset declaration
    460 // as first child of HEAD element and remove all original META charset
    461 // declarations.
    462 TEST_F(DomSerializerTests,
    463        SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) {
    464   FilePath page_file_path = data_dir_;
    465   page_file_path = page_file_path.AppendASCII("dom_serializer");
    466   page_file_path = page_file_path.AppendASCII("youtube_2.htm");
    467   // Get file URL.
    468   GURL file_url = net::FilePathToFileURL(page_file_path);
    469   ASSERT_TRUE(file_url.SchemeIsFile());
    470   // Load the test file.
    471   LoadPageFromURL(file_url);
    472 
    473   // Make sure there are multiple META charset declarations in original
    474   // document.
    475   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
    476   ASSERT_TRUE(web_frame != NULL);
    477   WebDocument doc = web_frame->document();
    478   ASSERT_TRUE(doc.isHTMLDocument());
    479   WebElement head_ele = doc.head();
    480   ASSERT_TRUE(!head_ele.isNull());
    481   // Go through all children of HEAD element.
    482   int charset_declaration_count = 0;
    483   for (WebNode child = head_ele.firstChild(); !child.isNull();
    484        child = child.nextSibling()) {
    485     std::string charset_info;
    486     if (IsMetaElement(child, charset_info) && !charset_info.empty())
    487       charset_declaration_count++;
    488   }
    489   // The original doc has more than META tags which have charset declaration.
    490   ASSERT_TRUE(charset_declaration_count > 1);
    491 
    492   // Do serialization.
    493   SerializeDomForURL(file_url, false);
    494 
    495   // Load the serialized contents.
    496   ASSERT_TRUE(HasSerializedFrame(file_url));
    497   const std::string& serialized_contents =
    498       GetSerializedContentForFrame(file_url);
    499   LoadContents(serialized_contents, file_url,
    500                web_frame->encoding());
    501   // Make sure only first child of HEAD element is META which has charset
    502   // declaration in serialized contents.
    503   web_frame = test_shell_->webView()->mainFrame();
    504   ASSERT_TRUE(web_frame != NULL);
    505   doc = web_frame->document();
    506   ASSERT_TRUE(doc.isHTMLDocument());
    507   head_ele = doc.head();
    508   ASSERT_TRUE(!head_ele.isNull());
    509   WebNode meta_node = head_ele.firstChild();
    510   ASSERT_TRUE(!meta_node.isNull());
    511   // Get meta charset info.
    512   std::string charset_info2;
    513   ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
    514   ASSERT_TRUE(!charset_info2.empty());
    515   ASSERT_TRUE(charset_info2 == std::string(web_frame->encoding().utf8()));
    516 
    517   // Make sure no more additional META tags which have charset declaration.
    518   for (WebNode child = meta_node.nextSibling(); !child.isNull();
    519        child = child.nextSibling()) {
    520     std::string charset_info;
    521     if (IsMetaElement(child, charset_info))
    522       ASSERT_TRUE(charset_info.empty());
    523   }
    524 }
    525 
    526 // Test situation of html entities in text when serializing HTML DOM.
    527 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) {
    528   FilePath page_file_path = data_dir_;
    529   page_file_path = page_file_path.AppendASCII(
    530       "dom_serializer/htmlentities_in_text.htm");
    531   // Get file URL. The URL is dummy URL to identify the following loading
    532   // actions. The test content is in constant:original_contents.
    533   GURL file_url = net::FilePathToFileURL(page_file_path);
    534   ASSERT_TRUE(file_url.SchemeIsFile());
    535   // Test contents.
    536   static const char* const original_contents =
    537       "<html><body>&amp;&lt;&gt;\"\'</body></html>";
    538   // Load the test contents.
    539   LoadContents(original_contents, file_url, WebString());
    540 
    541   // Get BODY's text content in DOM.
    542   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
    543   ASSERT_TRUE(web_frame != NULL);
    544   WebDocument doc = web_frame->document();
    545   ASSERT_TRUE(doc.isHTMLDocument());
    546   WebElement body_ele = doc.body();
    547   ASSERT_TRUE(!body_ele.isNull());
    548   WebNode text_node = body_ele.firstChild();
    549   ASSERT_TRUE(text_node.isTextNode());
    550   ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) ==
    551               "&amp;&lt;&gt;\"\'");
    552   // Do serialization.
    553   SerializeDomForURL(file_url, false);
    554   // Compare the serialized contents with original contents.
    555   ASSERT_TRUE(HasSerializedFrame(file_url));
    556   const std::string& serialized_contents =
    557       GetSerializedContentForFrame(file_url);
    558   // Compare the serialized contents with original contents to make sure
    559   // they are same.
    560   // Because we add MOTW when serializing DOM, so before comparison, we also
    561   // need to add MOTW to original_contents.
    562   std::string original_str =
    563     WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
    564   original_str += original_contents;
    565   // Since WebCore now inserts a new HEAD element if there is no HEAD element
    566   // when creating BODY element. (Please see HTMLParser::bodyCreateErrorCheck.)
    567   // We need to append the HEAD content and corresponding META content if we
    568   // find WebCore-generated HEAD element.
    569   if (!doc.head().isNull()) {
    570     WebString encoding = web_frame->encoding();
    571     std::string htmlTag("<html>");
    572     std::string::size_type pos = original_str.find(htmlTag);
    573     ASSERT_NE(std::string::npos, pos);
    574     pos += htmlTag.length();
    575     std::string head_part("<head>");
    576     head_part +=
    577         WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
    578     head_part += "</head>";
    579     original_str.insert(pos, head_part);
    580   }
    581   ASSERT_EQ(original_str, serialized_contents);
    582 }
    583 
    584 // Test situation of html entities in attribute value when serializing
    585 // HTML DOM.
    586 // This test started to fail at WebKit r65388. See http://crbug.com/52279.
    587 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInAttributeValue) {
    588   FilePath page_file_path = data_dir_;
    589   page_file_path = page_file_path.AppendASCII(
    590       "dom_serializer/htmlentities_in_attribute_value.htm");
    591   // Get file URL. The URL is dummy URL to identify the following loading
    592   // actions. The test content is in constant:original_contents.
    593   GURL file_url = net::FilePathToFileURL(page_file_path);
    594   ASSERT_TRUE(file_url.SchemeIsFile());
    595   // Test contents.
    596   static const char* const original_contents =
    597       "<html><body title=\"&amp;&lt;&gt;&quot;&#39;\"></body></html>";
    598   // Load the test contents.
    599   LoadContents(original_contents, file_url, WebString());
    600   // Get value of BODY's title attribute in DOM.
    601   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
    602   ASSERT_TRUE(web_frame != NULL);
    603   WebDocument doc = web_frame->document();
    604   ASSERT_TRUE(doc.isHTMLDocument());
    605   WebElement body_ele = doc.body();
    606   ASSERT_TRUE(!body_ele.isNull());
    607   WebString value = body_ele.getAttribute("title");
    608   ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'");
    609   // Do serialization.
    610   SerializeDomForURL(file_url, false);
    611   // Compare the serialized contents with original contents.
    612   ASSERT_TRUE(HasSerializedFrame(file_url));
    613   const std::string& serialized_contents =
    614       GetSerializedContentForFrame(file_url);
    615   // Compare the serialized contents with original contents to make sure
    616   // they are same.
    617   std::string original_str =
    618       WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
    619   original_str += original_contents;
    620   if (!doc.isNull()) {
    621     WebString encoding = web_frame->encoding();
    622     std::string htmlTag("<html>");
    623     std::string::size_type pos = original_str.find(htmlTag);
    624     ASSERT_NE(std::string::npos, pos);
    625     pos += htmlTag.length();
    626     std::string head_part("<head>");
    627     head_part +=
    628         WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
    629     head_part += "</head>";
    630     original_str.insert(pos, head_part);
    631   }
    632   ASSERT_EQ(original_str, serialized_contents);
    633 }
    634 
    635 // Test situation of non-standard HTML entities when serializing HTML DOM.
    636 // This test started to fail at WebKit r65351. See http://crbug.com/52279.
    637 TEST_F(DomSerializerTests, SerializeHTMLDOMWithNonStandardEntities) {
    638   // Make a test file URL and load it.
    639   FilePath page_file_path = data_dir_;
    640   page_file_path = page_file_path.AppendASCII("dom_serializer");
    641   page_file_path = page_file_path.AppendASCII("nonstandard_htmlentities.htm");
    642   GURL file_url = net::FilePathToFileURL(page_file_path);
    643   LoadPageFromURL(file_url);
    644 
    645   // Get value of BODY's title attribute in DOM.
    646   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
    647   WebDocument doc = web_frame->document();
    648   ASSERT_TRUE(doc.isHTMLDocument());
    649   WebElement body_element = doc.body();
    650   // Unescaped string for "&percnt;&nsup;&sup1;&apos;".
    651   static const wchar_t parsed_value[] = {
    652     '%', 0x2285, 0x00b9, '\'', 0
    653   };
    654   WebString value = body_element.getAttribute("title");
    655   ASSERT_TRUE(UTF16ToWide(value) == parsed_value);
    656   ASSERT_TRUE(UTF16ToWide(body_element.innerText()) == parsed_value);
    657 
    658   // Do serialization.
    659   SerializeDomForURL(file_url, false);
    660   // Check the serialized string.
    661   ASSERT_TRUE(HasSerializedFrame(file_url));
    662   const std::string& serialized_contents =
    663       GetSerializedContentForFrame(file_url);
    664   // Confirm that the serialized string has no non-standard HTML entities.
    665   ASSERT_EQ(std::string::npos, serialized_contents.find("&percnt;"));
    666   ASSERT_EQ(std::string::npos, serialized_contents.find("&nsup;"));
    667   ASSERT_EQ(std::string::npos, serialized_contents.find("&sup1;"));
    668   ASSERT_EQ(std::string::npos, serialized_contents.find("&apos;"));
    669 }
    670 
    671 // Test situation of BASE tag in original document when serializing HTML DOM.
    672 // When serializing, we should comment the BASE tag, append a new BASE tag.
    673 // rewrite all the savable URLs to relative local path, and change other URLs
    674 // to absolute URLs.
    675 TEST_F(DomSerializerTests, SerializeHTMLDOMWithBaseTag) {
    676   // There are total 2 available base tags in this test file.
    677   const int kTotalBaseTagCountInTestFile = 2;
    678 
    679   FilePath page_file_path = data_dir_.AppendASCII("dom_serializer");
    680   file_util::EnsureEndsWithSeparator(&page_file_path);
    681 
    682   // Get page dir URL which is base URL of this file.
    683   GURL path_dir_url = net::FilePathToFileURL(page_file_path);
    684   // Get file path.
    685   page_file_path =
    686       page_file_path.AppendASCII("html_doc_has_base_tag.htm");
    687   // Get file URL.
    688   GURL file_url = net::FilePathToFileURL(page_file_path);
    689   ASSERT_TRUE(file_url.SchemeIsFile());
    690   // Load the test file.
    691   LoadPageFromURL(file_url);
    692   // Since for this test, we assume there is no savable sub-resource links for
    693   // this test file, also all links are relative URLs in this test file, so we
    694   // need to check those relative URLs and make sure document has BASE tag.
    695   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
    696   ASSERT_TRUE(web_frame != NULL);
    697   WebDocument doc = web_frame->document();
    698   ASSERT_TRUE(doc.isHTMLDocument());
    699   // Go through all descent nodes.
    700   WebNodeCollection all = doc.all();
    701   int original_base_tag_count = 0;
    702   for (WebNode node = all.firstItem(); !node.isNull();
    703        node = all.nextItem()) {
    704     if (!node.isElementNode())
    705       continue;
    706     WebElement element = node.to<WebElement>();
    707     if (element.hasTagName("base")) {
    708       original_base_tag_count++;
    709     } else {
    710       // Get link.
    711       WebString value =
    712           webkit_glue::GetSubResourceLinkFromElement(element);
    713       if (value.isNull() && element.hasTagName("a")) {
    714         value = element.getAttribute("href");
    715         if (value.isEmpty())
    716           value = WebString();
    717       }
    718       // Each link is relative link.
    719       if (!value.isNull()) {
    720         GURL link(value.utf8());
    721         ASSERT_TRUE(link.scheme().empty());
    722       }
    723     }
    724   }
    725   ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile);
    726   // Make sure in original document, the base URL is not equal with the
    727   // |path_dir_url|.
    728   GURL original_base_url(doc.baseURL());
    729   ASSERT_NE(original_base_url, path_dir_url);
    730 
    731   // Do serialization.
    732   SerializeDomForURL(file_url, false);
    733 
    734   // Load the serialized contents.
    735   ASSERT_TRUE(HasSerializedFrame(file_url));
    736   const std::string& serialized_contents =
    737       GetSerializedContentForFrame(file_url);
    738   LoadContents(serialized_contents, file_url,
    739                web_frame->encoding());
    740 
    741   // Make sure all links are absolute URLs and doc there are some number of
    742   // BASE tags in serialized HTML data. Each of those BASE tags have same base
    743   // URL which is as same as URL of current test file.
    744   web_frame = test_shell_->webView()->mainFrame();
    745   ASSERT_TRUE(web_frame != NULL);
    746   doc = web_frame->document();
    747   ASSERT_TRUE(doc.isHTMLDocument());
    748   // Go through all descent nodes.
    749   all = doc.all();
    750   int new_base_tag_count = 0;
    751   for (WebNode node = all.firstItem(); !node.isNull();
    752        node = all.nextItem()) {
    753     if (!node.isElementNode())
    754       continue;
    755     WebElement element = node.to<WebElement>();
    756     if (element.hasTagName("base")) {
    757       new_base_tag_count++;
    758     } else {
    759       // Get link.
    760       WebString value =
    761           webkit_glue::GetSubResourceLinkFromElement(element);
    762       if (value.isNull() && element.hasTagName("a")) {
    763         value = element.getAttribute("href");
    764         if (value.isEmpty())
    765           value = WebString();
    766       }
    767       // Each link is absolute link.
    768       if (!value.isNull()) {
    769         GURL link(std::string(value.utf8()));
    770         ASSERT_FALSE(link.scheme().empty());
    771       }
    772     }
    773   }
    774   // We have one more added BASE tag which is generated by JavaScript.
    775   ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1);
    776   // Make sure in new document, the base URL is equal with the |path_dir_url|.
    777   GURL new_base_url(doc.baseURL());
    778   ASSERT_EQ(new_base_url, path_dir_url);
    779 }
    780 
    781 // Serializing page which has an empty HEAD tag.
    782 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) {
    783   FilePath page_file_path = data_dir_;
    784   page_file_path = page_file_path.AppendASCII("dom_serializer");
    785   page_file_path = page_file_path.AppendASCII("empty_head.htm");
    786   GURL file_url = net::FilePathToFileURL(page_file_path);
    787   ASSERT_TRUE(file_url.SchemeIsFile());
    788 
    789   // Load the test html content.
    790   static const char* const empty_head_contents =
    791     "<html><head></head><body>hello world</body></html>";
    792   LoadContents(empty_head_contents, file_url, WebString());
    793 
    794   // Make sure the head tag is empty.
    795   WebFrame* web_frame = test_shell_->webView()->mainFrame();
    796   ASSERT_TRUE(web_frame != NULL);
    797   WebDocument doc = web_frame->document();
    798   ASSERT_TRUE(doc.isHTMLDocument());
    799   WebElement head_element = doc.head();
    800   ASSERT_TRUE(!head_element.isNull());
    801   ASSERT_TRUE(!head_element.hasChildNodes());
    802   ASSERT_TRUE(head_element.childNodes().length() == 0);
    803 
    804   // Do serialization.
    805   SerializeDomForURL(file_url, false);
    806   // Make sure the serialized contents have META ;
    807   ASSERT_TRUE(HasSerializedFrame(file_url));
    808   const std::string& serialized_contents =
    809       GetSerializedContentForFrame(file_url);
    810 
    811   // Reload serialized contents and make sure there is only one META tag.
    812   LoadContents(serialized_contents, file_url, web_frame->encoding());
    813   web_frame = test_shell_->webView()->mainFrame();
    814   ASSERT_TRUE(web_frame != NULL);
    815   doc = web_frame->document();
    816   ASSERT_TRUE(doc.isHTMLDocument());
    817   head_element = doc.head();
    818   ASSERT_TRUE(!head_element.isNull());
    819   ASSERT_TRUE(head_element.hasChildNodes());
    820   ASSERT_TRUE(head_element.childNodes().length() == 1);
    821   WebNode meta_node = head_element.firstChild();
    822   ASSERT_TRUE(!meta_node.isNull());
    823   // Get meta charset info.
    824   std::string charset_info;
    825   ASSERT_TRUE(IsMetaElement(meta_node, charset_info));
    826   ASSERT_TRUE(!charset_info.empty());
    827   ASSERT_TRUE(charset_info == std::string(web_frame->encoding().utf8()));
    828 
    829   // Check the body's first node is text node and its contents are
    830   // "hello world"
    831   WebElement body_element = doc.body();
    832   ASSERT_TRUE(!body_element.isNull());
    833   WebNode text_node = body_element.firstChild();
    834   ASSERT_TRUE(text_node.isTextNode());
    835   WebString text_node_contents = text_node.nodeValue();
    836   ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world");
    837 }
    838 
    839 // Test that we don't crash when the page contains an iframe that
    840 // was handled as a download (http://crbug.com/42212).
    841 TEST_F(DomSerializerTests, SerializeDocumentWithDownloadedIFrame) {
    842   FilePath page_file_path = data_dir_;
    843   page_file_path = page_file_path.AppendASCII("dom_serializer");
    844   page_file_path = page_file_path.AppendASCII("iframe-src-is-exe.htm");
    845   GURL file_url = net::FilePathToFileURL(page_file_path);
    846   ASSERT_TRUE(file_url.SchemeIsFile());
    847   // Load the test file.
    848   LoadPageFromURL(file_url);
    849   // Do a recursive serialization. We pass if we don't crash.
    850   SerializeDomForURL(file_url, true);
    851 }
    852 
    853 }  // namespace
    854