1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/compiler_specific.h" 6 #include "base/file_path.h" 7 #include "base/file_util.h" 8 #include "base/hash_tables.h" 9 #include "base/string_util.h" 10 #include "base/utf_string_conversions.h" 11 #include "net/base/net_util.h" 12 #include "net/url_request/url_request_context.h" 13 #include "third_party/WebKit/Source/WebKit/chromium/public/WebCString.h" 14 #include "third_party/WebKit/Source/WebKit/chromium/public/WebData.h" 15 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h" 16 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h" 17 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h" 18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNode.h" 19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h" 20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeList.h" 21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebPageSerializer.h" 22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebPageSerializerClient.h" 23 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h" 24 #include "third_party/WebKit/Source/WebKit/chromium/public/WebURL.h" 25 #include "third_party/WebKit/Source/WebKit/chromium/public/WebVector.h" 26 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h" 27 #include "webkit/glue/dom_operations.h" 28 #include "webkit/glue/webkit_glue.h" 29 #include "webkit/tools/test_shell/simple_resource_loader_bridge.h" 30 #include "webkit/tools/test_shell/test_shell_test.h" 31 32 using WebKit::WebCString; 33 using WebKit::WebData; 34 using WebKit::WebDocument; 35 using WebKit::WebElement; 36 using WebKit::WebFrame; 37 using WebKit::WebNode; 38 using WebKit::WebNodeCollection; 39 using WebKit::WebNodeList; 40 using WebKit::WebPageSerializer; 41 using WebKit::WebPageSerializerClient; 42 using WebKit::WebNode; 43 using WebKit::WebString; 44 using WebKit::WebURL; 45 using WebKit::WebView; 46 using WebKit::WebVector; 47 48 namespace { 49 50 // Iterate recursively over sub-frames to find one with with a given url. 51 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) { 52 if (!web_view->mainFrame()) 53 return NULL; 54 55 std::vector<WebFrame*> stack; 56 stack.push_back(web_view->mainFrame()); 57 58 while (!stack.empty()) { 59 WebFrame* current_frame = stack.back(); 60 stack.pop_back(); 61 if (GURL(current_frame->url()) == url) 62 return current_frame; 63 WebNodeCollection all = current_frame->document().all(); 64 for (WebNode node = all.firstItem(); 65 !node.isNull(); node = all.nextItem()) { 66 if (!node.isElementNode()) 67 continue; 68 // Check frame tag and iframe tag 69 WebElement element = node.to<WebElement>(); 70 if (!element.hasTagName("frame") && !element.hasTagName("iframe")) 71 continue; 72 WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element); 73 if (sub_frame) 74 stack.push_back(sub_frame); 75 } 76 } 77 return NULL; 78 } 79 80 class DomSerializerTests : public TestShellTest, 81 public WebPageSerializerClient { 82 public: 83 DomSerializerTests() 84 : local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) { } 85 86 // DomSerializerDelegate. 87 void didSerializeDataForFrame(const WebURL& frame_web_url, 88 const WebCString& data, 89 PageSerializationStatus status) { 90 91 GURL frame_url(frame_web_url); 92 // If the all frames are finished saving, check all finish status 93 if (status == WebPageSerializerClient::AllFramesAreFinished) { 94 SerializationFinishStatusMap::iterator it = 95 serialization_finish_status_.begin(); 96 for (; it != serialization_finish_status_.end(); ++it) 97 ASSERT_TRUE(it->second); 98 serialized_ = true; 99 return; 100 } 101 102 // Check finish status of current frame. 103 SerializationFinishStatusMap::iterator it = 104 serialization_finish_status_.find(frame_url.spec()); 105 // New frame, set initial status as false. 106 if (it == serialization_finish_status_.end()) 107 serialization_finish_status_[frame_url.spec()] = false; 108 109 it = serialization_finish_status_.find(frame_url.spec()); 110 ASSERT_TRUE(it != serialization_finish_status_.end()); 111 // In process frame, finish status should be false. 112 ASSERT_FALSE(it->second); 113 114 // Add data to corresponding frame's content. 115 serialized_frame_map_[frame_url.spec()] += data.data(); 116 117 // Current frame is completed saving, change the finish status. 118 if (status == WebPageSerializerClient::CurrentFrameIsFinished) 119 it->second = true; 120 } 121 122 bool HasSerializedFrame(const GURL& frame_url) { 123 return serialized_frame_map_.find(frame_url.spec()) != 124 serialized_frame_map_.end(); 125 } 126 127 const std::string& GetSerializedContentForFrame( 128 const GURL& frame_url) { 129 return serialized_frame_map_[frame_url.spec()]; 130 } 131 132 // Load web page according to specific URL. 133 void LoadPageFromURL(const GURL& page_url) { 134 // Load the test file. 135 test_shell_->ResetTestController(); 136 test_shell_->LoadURL(page_url); 137 test_shell_->WaitTestFinished(); 138 } 139 140 // Load web page according to input content and relative URLs within 141 // the document. 142 void LoadContents(const std::string& contents, 143 const GURL& base_url, 144 const WebString encoding_info) { 145 test_shell_->ResetTestController(); 146 // If input encoding is empty, use UTF-8 as default encoding. 147 if (encoding_info.isEmpty()) { 148 test_shell_->webView()->mainFrame()->loadHTMLString(contents, base_url); 149 } else { 150 WebData data(contents.data(), contents.length()); 151 152 // Do not use WebFrame.LoadHTMLString because it assumes that input 153 // html contents use UTF-8 encoding. 154 // TODO(darin): This should use WebFrame::loadData. 155 WebFrame* web_frame = 156 test_shell_->webView()->mainFrame(); 157 158 ASSERT_TRUE(web_frame != NULL); 159 160 web_frame->loadData(data, "text/html", encoding_info, base_url); 161 } 162 163 test_shell_->WaitTestFinished(); 164 } 165 166 // Serialize page DOM according to specific page URL. The parameter 167 // recursive_serialization indicates whether we will serialize all 168 // sub-frames. 169 void SerializeDomForURL(const GURL& page_url, 170 bool recursive_serialization) { 171 // Find corresponding WebFrame according to page_url. 172 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), 173 page_url); 174 ASSERT_TRUE(web_frame != NULL); 175 // Add input file URl to links_. 176 links_.assign(&page_url,1); 177 // Add dummy file path to local_path_. 178 WebString file_path = webkit_glue::FilePathStringToWebString( 179 FILE_PATH_LITERAL("c:\\dummy.htm")); 180 local_paths_.assign(&file_path, 1); 181 // Start serializing DOM. 182 bool result = WebPageSerializer::serialize(web_frame, 183 recursive_serialization, 184 static_cast<WebPageSerializerClient*>(this), 185 links_, 186 local_paths_, 187 webkit_glue::FilePathToWebString(local_directory_name_)); 188 ASSERT_TRUE(result); 189 ASSERT_TRUE(serialized_); 190 } 191 192 private: 193 // Map frame_url to corresponding serialized_content. 194 typedef base::hash_map<std::string, std::string> SerializedFrameContentMap; 195 SerializedFrameContentMap serialized_frame_map_; 196 // Map frame_url to corresponding status of serialization finish. 197 typedef base::hash_map<std::string, bool> SerializationFinishStatusMap; 198 SerializationFinishStatusMap serialization_finish_status_; 199 // Flag indicates whether the process of serializing DOM is finished or not. 200 bool serialized_; 201 // The links_ contain dummy original URLs of all saved links. 202 WebVector<WebURL> links_; 203 // The local_paths_ contain dummy corresponding local file paths of all saved 204 // links, which matched links_ one by one. 205 WebVector<WebString> local_paths_; 206 // The local_directory_name_ is dummy relative path of directory which 207 // contain all saved auxiliary files included all sub frames and resources. 208 const FilePath local_directory_name_; 209 210 protected: 211 // testing::Test 212 virtual void SetUp() { 213 TestShellTest::SetUp(); 214 serialized_ = false; 215 } 216 217 virtual void TearDown() { 218 TestShellTest::TearDown(); 219 } 220 }; 221 222 // Helper function that test whether the first node in the doc is a doc type 223 // node. 224 bool HasDocType(const WebDocument& doc) { 225 WebNode node = doc.firstChild(); 226 if (node.isNull()) 227 return false; 228 return node.nodeType() == WebNode::DocumentTypeNode; 229 } 230 231 // Helper function for checking whether input node is META tag. Return true 232 // means it is META element, otherwise return false. The parameter charset_info 233 // return actual charset info if the META tag has charset declaration. 234 bool IsMetaElement(const WebNode& node, std::string& charset_info) { 235 if (!node.isElementNode()) 236 return false; 237 const WebElement meta = node.toConst<WebElement>(); 238 if (!meta.hasTagName("meta")) 239 return false; 240 charset_info.erase(0, charset_info.length()); 241 // Check the META charset declaration. 242 WebString httpEquiv = meta.getAttribute("http-equiv"); 243 if (LowerCaseEqualsASCII(httpEquiv, "content-type")) { 244 std::string content = meta.getAttribute("content").utf8(); 245 int pos = content.find("charset", 0); 246 if (pos > -1) { 247 // Add a dummy charset declaration to charset_info, which indicates this 248 // META tag has charset declaration although we do not get correct value 249 // yet. 250 charset_info.append("has-charset-declaration"); 251 int remaining_length = content.length() - pos - 7; 252 if (!remaining_length) 253 return true; 254 int start_pos = pos + 7; 255 // Find "=" symbol. 256 while (remaining_length--) 257 if (content[start_pos++] == L'=') 258 break; 259 // Skip beginning space. 260 while (remaining_length) { 261 if (content[start_pos] > 0x0020) 262 break; 263 ++start_pos; 264 --remaining_length; 265 } 266 if (!remaining_length) 267 return true; 268 int end_pos = start_pos; 269 // Now we find out the start point of charset info. Search the end point. 270 while (remaining_length--) { 271 if (content[end_pos] <= 0x0020 || content[end_pos] == L';') 272 break; 273 ++end_pos; 274 } 275 // Get actual charset info. 276 charset_info = content.substr(start_pos, end_pos - start_pos); 277 return true; 278 } 279 } 280 return true; 281 } 282 283 // If original contents have document type, the serialized contents also have 284 // document type. 285 TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) { 286 FilePath page_file_path = data_dir_; 287 page_file_path = page_file_path.AppendASCII("dom_serializer"); 288 page_file_path = page_file_path.AppendASCII("youtube_1.htm"); 289 GURL file_url = net::FilePathToFileURL(page_file_path); 290 ASSERT_TRUE(file_url.SchemeIsFile()); 291 // Load the test file. 292 LoadPageFromURL(file_url); 293 // Make sure original contents have document type. 294 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url); 295 ASSERT_TRUE(web_frame != NULL); 296 WebDocument doc = web_frame->document(); 297 ASSERT_TRUE(HasDocType(doc)); 298 // Do serialization. 299 SerializeDomForURL(file_url, false); 300 // Load the serialized contents. 301 ASSERT_TRUE(HasSerializedFrame(file_url)); 302 const std::string& serialized_contents = 303 GetSerializedContentForFrame(file_url); 304 LoadContents(serialized_contents, file_url, 305 web_frame->encoding()); 306 // Make sure serialized contents still have document type. 307 web_frame = test_shell_->webView()->mainFrame(); 308 doc = web_frame->document(); 309 ASSERT_TRUE(HasDocType(doc)); 310 } 311 312 // If original contents do not have document type, the serialized contents 313 // also do not have document type. 314 TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) { 315 FilePath page_file_path = data_dir_; 316 page_file_path = page_file_path.AppendASCII("dom_serializer"); 317 page_file_path = page_file_path.AppendASCII("youtube_2.htm"); 318 GURL file_url = net::FilePathToFileURL(page_file_path); 319 ASSERT_TRUE(file_url.SchemeIsFile()); 320 // Load the test file. 321 LoadPageFromURL(file_url); 322 // Make sure original contents do not have document type. 323 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url); 324 ASSERT_TRUE(web_frame != NULL); 325 WebDocument doc = web_frame->document(); 326 ASSERT_TRUE(!HasDocType(doc)); 327 // Do serialization. 328 SerializeDomForURL(file_url, false); 329 // Load the serialized contents. 330 ASSERT_TRUE(HasSerializedFrame(file_url)); 331 const std::string& serialized_contents = 332 GetSerializedContentForFrame(file_url); 333 LoadContents(serialized_contents, file_url, 334 web_frame->encoding()); 335 // Make sure serialized contents do not have document type. 336 web_frame = test_shell_->webView()->mainFrame(); 337 doc = web_frame->document(); 338 ASSERT_TRUE(!HasDocType(doc)); 339 } 340 341 // Serialize XML document which has all 5 built-in entities. After 342 // finishing serialization, the serialized contents should be same 343 // with original XML document. 344 TEST_F(DomSerializerTests, SerializeXMLDocWithBuiltInEntities) { 345 FilePath page_file_path = data_dir_; 346 page_file_path = page_file_path.AppendASCII("dom_serializer"); 347 page_file_path = page_file_path.AppendASCII("note.xml"); 348 // Read original contents for later comparison. 349 std::string original_contents; 350 ASSERT_TRUE(file_util::ReadFileToString(page_file_path, &original_contents)); 351 // Get file URL. 352 GURL file_url = net::FilePathToFileURL(page_file_path); 353 ASSERT_TRUE(file_url.SchemeIsFile()); 354 // Load the test file. 355 LoadPageFromURL(file_url); 356 // Do serialization. 357 SerializeDomForURL(file_url, false); 358 // Compare the serialized contents with original contents. 359 ASSERT_TRUE(HasSerializedFrame(file_url)); 360 const std::string& serialized_contents = 361 GetSerializedContentForFrame(file_url); 362 ASSERT_EQ(original_contents, serialized_contents); 363 } 364 365 // When serializing DOM, we add MOTW declaration before html tag. 366 TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) { 367 FilePath page_file_path = data_dir_; 368 page_file_path = page_file_path.AppendASCII("dom_serializer"); 369 page_file_path = page_file_path.AppendASCII("youtube_2.htm"); 370 // Read original contents for later comparison . 371 std::string original_contents; 372 ASSERT_TRUE(file_util::ReadFileToString(page_file_path, &original_contents)); 373 // Get file URL. 374 GURL file_url = net::FilePathToFileURL(page_file_path); 375 ASSERT_TRUE(file_url.SchemeIsFile()); 376 // Make sure original contents does not have MOTW; 377 std::string motw_declaration = 378 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8(); 379 ASSERT_FALSE(motw_declaration.empty()); 380 // The encoding of original contents is ISO-8859-1, so we convert the MOTW 381 // declaration to ASCII and search whether original contents has it or not. 382 ASSERT_TRUE(std::string::npos == 383 original_contents.find(motw_declaration)); 384 // Load the test file. 385 LoadPageFromURL(file_url); 386 // Do serialization. 387 SerializeDomForURL(file_url, false); 388 // Make sure the serialized contents have MOTW ; 389 ASSERT_TRUE(HasSerializedFrame(file_url)); 390 const std::string& serialized_contents = 391 GetSerializedContentForFrame(file_url); 392 ASSERT_FALSE(std::string::npos == 393 serialized_contents.find(motw_declaration)); 394 } 395 396 // When serializing DOM, we will add the META which have correct charset 397 // declaration as first child of HEAD element for resolving WebKit bug: 398 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document 399 // does not have META charset declaration. 400 TEST_F(DomSerializerTests, SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) { 401 FilePath page_file_path = data_dir_; 402 page_file_path = page_file_path.AppendASCII("dom_serializer"); 403 page_file_path = page_file_path.AppendASCII("youtube_1.htm"); 404 // Get file URL. 405 GURL file_url = net::FilePathToFileURL(page_file_path); 406 ASSERT_TRUE(file_url.SchemeIsFile()); 407 // Load the test file. 408 LoadPageFromURL(file_url); 409 410 // Make sure there is no META charset declaration in original document. 411 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url); 412 ASSERT_TRUE(web_frame != NULL); 413 WebDocument doc = web_frame->document(); 414 ASSERT_TRUE(doc.isHTMLDocument()); 415 WebElement head_element = doc.head(); 416 ASSERT_TRUE(!head_element.isNull()); 417 // Go through all children of HEAD element. 418 for (WebNode child = head_element.firstChild(); !child.isNull(); 419 child = child.nextSibling()) { 420 std::string charset_info; 421 if (IsMetaElement(child, charset_info)) 422 ASSERT_TRUE(charset_info.empty()); 423 } 424 // Do serialization. 425 SerializeDomForURL(file_url, false); 426 427 // Load the serialized contents. 428 ASSERT_TRUE(HasSerializedFrame(file_url)); 429 const std::string& serialized_contents = 430 GetSerializedContentForFrame(file_url); 431 LoadContents(serialized_contents, file_url, 432 web_frame->encoding()); 433 // Make sure the first child of HEAD element is META which has charset 434 // declaration in serialized contents. 435 web_frame = test_shell_->webView()->mainFrame(); 436 ASSERT_TRUE(web_frame != NULL); 437 doc = web_frame->document(); 438 ASSERT_TRUE(doc.isHTMLDocument()); 439 head_element = doc.head(); 440 ASSERT_TRUE(!head_element.isNull()); 441 WebNode meta_node = head_element.firstChild(); 442 ASSERT_TRUE(!meta_node.isNull()); 443 // Get meta charset info. 444 std::string charset_info2; 445 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2)); 446 ASSERT_TRUE(!charset_info2.empty()); 447 ASSERT_TRUE(charset_info2 == std::string(web_frame->encoding().utf8())); 448 449 // Make sure no more additional META tags which have charset declaration. 450 for (WebNode child = meta_node.nextSibling(); !child.isNull(); 451 child = child.nextSibling()) { 452 std::string charset_info; 453 if (IsMetaElement(child, charset_info)) 454 ASSERT_TRUE(charset_info.empty()); 455 } 456 } 457 458 // When serializing DOM, if the original document has multiple META charset 459 // declaration, we will add the META which have correct charset declaration 460 // as first child of HEAD element and remove all original META charset 461 // declarations. 462 TEST_F(DomSerializerTests, 463 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) { 464 FilePath page_file_path = data_dir_; 465 page_file_path = page_file_path.AppendASCII("dom_serializer"); 466 page_file_path = page_file_path.AppendASCII("youtube_2.htm"); 467 // Get file URL. 468 GURL file_url = net::FilePathToFileURL(page_file_path); 469 ASSERT_TRUE(file_url.SchemeIsFile()); 470 // Load the test file. 471 LoadPageFromURL(file_url); 472 473 // Make sure there are multiple META charset declarations in original 474 // document. 475 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url); 476 ASSERT_TRUE(web_frame != NULL); 477 WebDocument doc = web_frame->document(); 478 ASSERT_TRUE(doc.isHTMLDocument()); 479 WebElement head_ele = doc.head(); 480 ASSERT_TRUE(!head_ele.isNull()); 481 // Go through all children of HEAD element. 482 int charset_declaration_count = 0; 483 for (WebNode child = head_ele.firstChild(); !child.isNull(); 484 child = child.nextSibling()) { 485 std::string charset_info; 486 if (IsMetaElement(child, charset_info) && !charset_info.empty()) 487 charset_declaration_count++; 488 } 489 // The original doc has more than META tags which have charset declaration. 490 ASSERT_TRUE(charset_declaration_count > 1); 491 492 // Do serialization. 493 SerializeDomForURL(file_url, false); 494 495 // Load the serialized contents. 496 ASSERT_TRUE(HasSerializedFrame(file_url)); 497 const std::string& serialized_contents = 498 GetSerializedContentForFrame(file_url); 499 LoadContents(serialized_contents, file_url, 500 web_frame->encoding()); 501 // Make sure only first child of HEAD element is META which has charset 502 // declaration in serialized contents. 503 web_frame = test_shell_->webView()->mainFrame(); 504 ASSERT_TRUE(web_frame != NULL); 505 doc = web_frame->document(); 506 ASSERT_TRUE(doc.isHTMLDocument()); 507 head_ele = doc.head(); 508 ASSERT_TRUE(!head_ele.isNull()); 509 WebNode meta_node = head_ele.firstChild(); 510 ASSERT_TRUE(!meta_node.isNull()); 511 // Get meta charset info. 512 std::string charset_info2; 513 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2)); 514 ASSERT_TRUE(!charset_info2.empty()); 515 ASSERT_TRUE(charset_info2 == std::string(web_frame->encoding().utf8())); 516 517 // Make sure no more additional META tags which have charset declaration. 518 for (WebNode child = meta_node.nextSibling(); !child.isNull(); 519 child = child.nextSibling()) { 520 std::string charset_info; 521 if (IsMetaElement(child, charset_info)) 522 ASSERT_TRUE(charset_info.empty()); 523 } 524 } 525 526 // Test situation of html entities in text when serializing HTML DOM. 527 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) { 528 FilePath page_file_path = data_dir_; 529 page_file_path = page_file_path.AppendASCII( 530 "dom_serializer/htmlentities_in_text.htm"); 531 // Get file URL. The URL is dummy URL to identify the following loading 532 // actions. The test content is in constant:original_contents. 533 GURL file_url = net::FilePathToFileURL(page_file_path); 534 ASSERT_TRUE(file_url.SchemeIsFile()); 535 // Test contents. 536 static const char* const original_contents = 537 "<html><body>&<>\"\'</body></html>"; 538 // Load the test contents. 539 LoadContents(original_contents, file_url, WebString()); 540 541 // Get BODY's text content in DOM. 542 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url); 543 ASSERT_TRUE(web_frame != NULL); 544 WebDocument doc = web_frame->document(); 545 ASSERT_TRUE(doc.isHTMLDocument()); 546 WebElement body_ele = doc.body(); 547 ASSERT_TRUE(!body_ele.isNull()); 548 WebNode text_node = body_ele.firstChild(); 549 ASSERT_TRUE(text_node.isTextNode()); 550 ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) == 551 "&<>\"\'"); 552 // Do serialization. 553 SerializeDomForURL(file_url, false); 554 // Compare the serialized contents with original contents. 555 ASSERT_TRUE(HasSerializedFrame(file_url)); 556 const std::string& serialized_contents = 557 GetSerializedContentForFrame(file_url); 558 // Compare the serialized contents with original contents to make sure 559 // they are same. 560 // Because we add MOTW when serializing DOM, so before comparison, we also 561 // need to add MOTW to original_contents. 562 std::string original_str = 563 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8(); 564 original_str += original_contents; 565 // Since WebCore now inserts a new HEAD element if there is no HEAD element 566 // when creating BODY element. (Please see HTMLParser::bodyCreateErrorCheck.) 567 // We need to append the HEAD content and corresponding META content if we 568 // find WebCore-generated HEAD element. 569 if (!doc.head().isNull()) { 570 WebString encoding = web_frame->encoding(); 571 std::string htmlTag("<html>"); 572 std::string::size_type pos = original_str.find(htmlTag); 573 ASSERT_NE(std::string::npos, pos); 574 pos += htmlTag.length(); 575 std::string head_part("<head>"); 576 head_part += 577 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8(); 578 head_part += "</head>"; 579 original_str.insert(pos, head_part); 580 } 581 ASSERT_EQ(original_str, serialized_contents); 582 } 583 584 // Test situation of html entities in attribute value when serializing 585 // HTML DOM. 586 // This test started to fail at WebKit r65388. See http://crbug.com/52279. 587 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInAttributeValue) { 588 FilePath page_file_path = data_dir_; 589 page_file_path = page_file_path.AppendASCII( 590 "dom_serializer/htmlentities_in_attribute_value.htm"); 591 // Get file URL. The URL is dummy URL to identify the following loading 592 // actions. The test content is in constant:original_contents. 593 GURL file_url = net::FilePathToFileURL(page_file_path); 594 ASSERT_TRUE(file_url.SchemeIsFile()); 595 // Test contents. 596 static const char* const original_contents = 597 "<html><body title=\"&<>"'\"></body></html>"; 598 // Load the test contents. 599 LoadContents(original_contents, file_url, WebString()); 600 // Get value of BODY's title attribute in DOM. 601 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url); 602 ASSERT_TRUE(web_frame != NULL); 603 WebDocument doc = web_frame->document(); 604 ASSERT_TRUE(doc.isHTMLDocument()); 605 WebElement body_ele = doc.body(); 606 ASSERT_TRUE(!body_ele.isNull()); 607 WebString value = body_ele.getAttribute("title"); 608 ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'"); 609 // Do serialization. 610 SerializeDomForURL(file_url, false); 611 // Compare the serialized contents with original contents. 612 ASSERT_TRUE(HasSerializedFrame(file_url)); 613 const std::string& serialized_contents = 614 GetSerializedContentForFrame(file_url); 615 // Compare the serialized contents with original contents to make sure 616 // they are same. 617 std::string original_str = 618 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8(); 619 original_str += original_contents; 620 if (!doc.isNull()) { 621 WebString encoding = web_frame->encoding(); 622 std::string htmlTag("<html>"); 623 std::string::size_type pos = original_str.find(htmlTag); 624 ASSERT_NE(std::string::npos, pos); 625 pos += htmlTag.length(); 626 std::string head_part("<head>"); 627 head_part += 628 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8(); 629 head_part += "</head>"; 630 original_str.insert(pos, head_part); 631 } 632 ASSERT_EQ(original_str, serialized_contents); 633 } 634 635 // Test situation of non-standard HTML entities when serializing HTML DOM. 636 // This test started to fail at WebKit r65351. See http://crbug.com/52279. 637 TEST_F(DomSerializerTests, SerializeHTMLDOMWithNonStandardEntities) { 638 // Make a test file URL and load it. 639 FilePath page_file_path = data_dir_; 640 page_file_path = page_file_path.AppendASCII("dom_serializer"); 641 page_file_path = page_file_path.AppendASCII("nonstandard_htmlentities.htm"); 642 GURL file_url = net::FilePathToFileURL(page_file_path); 643 LoadPageFromURL(file_url); 644 645 // Get value of BODY's title attribute in DOM. 646 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url); 647 WebDocument doc = web_frame->document(); 648 ASSERT_TRUE(doc.isHTMLDocument()); 649 WebElement body_element = doc.body(); 650 // Unescaped string for "%⊅¹'". 651 static const wchar_t parsed_value[] = { 652 '%', 0x2285, 0x00b9, '\'', 0 653 }; 654 WebString value = body_element.getAttribute("title"); 655 ASSERT_TRUE(UTF16ToWide(value) == parsed_value); 656 ASSERT_TRUE(UTF16ToWide(body_element.innerText()) == parsed_value); 657 658 // Do serialization. 659 SerializeDomForURL(file_url, false); 660 // Check the serialized string. 661 ASSERT_TRUE(HasSerializedFrame(file_url)); 662 const std::string& serialized_contents = 663 GetSerializedContentForFrame(file_url); 664 // Confirm that the serialized string has no non-standard HTML entities. 665 ASSERT_EQ(std::string::npos, serialized_contents.find("%")); 666 ASSERT_EQ(std::string::npos, serialized_contents.find("⊅")); 667 ASSERT_EQ(std::string::npos, serialized_contents.find("¹")); 668 ASSERT_EQ(std::string::npos, serialized_contents.find("'")); 669 } 670 671 // Test situation of BASE tag in original document when serializing HTML DOM. 672 // When serializing, we should comment the BASE tag, append a new BASE tag. 673 // rewrite all the savable URLs to relative local path, and change other URLs 674 // to absolute URLs. 675 TEST_F(DomSerializerTests, SerializeHTMLDOMWithBaseTag) { 676 // There are total 2 available base tags in this test file. 677 const int kTotalBaseTagCountInTestFile = 2; 678 679 FilePath page_file_path = data_dir_.AppendASCII("dom_serializer"); 680 file_util::EnsureEndsWithSeparator(&page_file_path); 681 682 // Get page dir URL which is base URL of this file. 683 GURL path_dir_url = net::FilePathToFileURL(page_file_path); 684 // Get file path. 685 page_file_path = 686 page_file_path.AppendASCII("html_doc_has_base_tag.htm"); 687 // Get file URL. 688 GURL file_url = net::FilePathToFileURL(page_file_path); 689 ASSERT_TRUE(file_url.SchemeIsFile()); 690 // Load the test file. 691 LoadPageFromURL(file_url); 692 // Since for this test, we assume there is no savable sub-resource links for 693 // this test file, also all links are relative URLs in this test file, so we 694 // need to check those relative URLs and make sure document has BASE tag. 695 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url); 696 ASSERT_TRUE(web_frame != NULL); 697 WebDocument doc = web_frame->document(); 698 ASSERT_TRUE(doc.isHTMLDocument()); 699 // Go through all descent nodes. 700 WebNodeCollection all = doc.all(); 701 int original_base_tag_count = 0; 702 for (WebNode node = all.firstItem(); !node.isNull(); 703 node = all.nextItem()) { 704 if (!node.isElementNode()) 705 continue; 706 WebElement element = node.to<WebElement>(); 707 if (element.hasTagName("base")) { 708 original_base_tag_count++; 709 } else { 710 // Get link. 711 WebString value = 712 webkit_glue::GetSubResourceLinkFromElement(element); 713 if (value.isNull() && element.hasTagName("a")) { 714 value = element.getAttribute("href"); 715 if (value.isEmpty()) 716 value = WebString(); 717 } 718 // Each link is relative link. 719 if (!value.isNull()) { 720 GURL link(value.utf8()); 721 ASSERT_TRUE(link.scheme().empty()); 722 } 723 } 724 } 725 ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile); 726 // Make sure in original document, the base URL is not equal with the 727 // |path_dir_url|. 728 GURL original_base_url(doc.baseURL()); 729 ASSERT_NE(original_base_url, path_dir_url); 730 731 // Do serialization. 732 SerializeDomForURL(file_url, false); 733 734 // Load the serialized contents. 735 ASSERT_TRUE(HasSerializedFrame(file_url)); 736 const std::string& serialized_contents = 737 GetSerializedContentForFrame(file_url); 738 LoadContents(serialized_contents, file_url, 739 web_frame->encoding()); 740 741 // Make sure all links are absolute URLs and doc there are some number of 742 // BASE tags in serialized HTML data. Each of those BASE tags have same base 743 // URL which is as same as URL of current test file. 744 web_frame = test_shell_->webView()->mainFrame(); 745 ASSERT_TRUE(web_frame != NULL); 746 doc = web_frame->document(); 747 ASSERT_TRUE(doc.isHTMLDocument()); 748 // Go through all descent nodes. 749 all = doc.all(); 750 int new_base_tag_count = 0; 751 for (WebNode node = all.firstItem(); !node.isNull(); 752 node = all.nextItem()) { 753 if (!node.isElementNode()) 754 continue; 755 WebElement element = node.to<WebElement>(); 756 if (element.hasTagName("base")) { 757 new_base_tag_count++; 758 } else { 759 // Get link. 760 WebString value = 761 webkit_glue::GetSubResourceLinkFromElement(element); 762 if (value.isNull() && element.hasTagName("a")) { 763 value = element.getAttribute("href"); 764 if (value.isEmpty()) 765 value = WebString(); 766 } 767 // Each link is absolute link. 768 if (!value.isNull()) { 769 GURL link(std::string(value.utf8())); 770 ASSERT_FALSE(link.scheme().empty()); 771 } 772 } 773 } 774 // We have one more added BASE tag which is generated by JavaScript. 775 ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1); 776 // Make sure in new document, the base URL is equal with the |path_dir_url|. 777 GURL new_base_url(doc.baseURL()); 778 ASSERT_EQ(new_base_url, path_dir_url); 779 } 780 781 // Serializing page which has an empty HEAD tag. 782 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) { 783 FilePath page_file_path = data_dir_; 784 page_file_path = page_file_path.AppendASCII("dom_serializer"); 785 page_file_path = page_file_path.AppendASCII("empty_head.htm"); 786 GURL file_url = net::FilePathToFileURL(page_file_path); 787 ASSERT_TRUE(file_url.SchemeIsFile()); 788 789 // Load the test html content. 790 static const char* const empty_head_contents = 791 "<html><head></head><body>hello world</body></html>"; 792 LoadContents(empty_head_contents, file_url, WebString()); 793 794 // Make sure the head tag is empty. 795 WebFrame* web_frame = test_shell_->webView()->mainFrame(); 796 ASSERT_TRUE(web_frame != NULL); 797 WebDocument doc = web_frame->document(); 798 ASSERT_TRUE(doc.isHTMLDocument()); 799 WebElement head_element = doc.head(); 800 ASSERT_TRUE(!head_element.isNull()); 801 ASSERT_TRUE(!head_element.hasChildNodes()); 802 ASSERT_TRUE(head_element.childNodes().length() == 0); 803 804 // Do serialization. 805 SerializeDomForURL(file_url, false); 806 // Make sure the serialized contents have META ; 807 ASSERT_TRUE(HasSerializedFrame(file_url)); 808 const std::string& serialized_contents = 809 GetSerializedContentForFrame(file_url); 810 811 // Reload serialized contents and make sure there is only one META tag. 812 LoadContents(serialized_contents, file_url, web_frame->encoding()); 813 web_frame = test_shell_->webView()->mainFrame(); 814 ASSERT_TRUE(web_frame != NULL); 815 doc = web_frame->document(); 816 ASSERT_TRUE(doc.isHTMLDocument()); 817 head_element = doc.head(); 818 ASSERT_TRUE(!head_element.isNull()); 819 ASSERT_TRUE(head_element.hasChildNodes()); 820 ASSERT_TRUE(head_element.childNodes().length() == 1); 821 WebNode meta_node = head_element.firstChild(); 822 ASSERT_TRUE(!meta_node.isNull()); 823 // Get meta charset info. 824 std::string charset_info; 825 ASSERT_TRUE(IsMetaElement(meta_node, charset_info)); 826 ASSERT_TRUE(!charset_info.empty()); 827 ASSERT_TRUE(charset_info == std::string(web_frame->encoding().utf8())); 828 829 // Check the body's first node is text node and its contents are 830 // "hello world" 831 WebElement body_element = doc.body(); 832 ASSERT_TRUE(!body_element.isNull()); 833 WebNode text_node = body_element.firstChild(); 834 ASSERT_TRUE(text_node.isTextNode()); 835 WebString text_node_contents = text_node.nodeValue(); 836 ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world"); 837 } 838 839 // Test that we don't crash when the page contains an iframe that 840 // was handled as a download (http://crbug.com/42212). 841 TEST_F(DomSerializerTests, SerializeDocumentWithDownloadedIFrame) { 842 FilePath page_file_path = data_dir_; 843 page_file_path = page_file_path.AppendASCII("dom_serializer"); 844 page_file_path = page_file_path.AppendASCII("iframe-src-is-exe.htm"); 845 GURL file_url = net::FilePathToFileURL(page_file_path); 846 ASSERT_TRUE(file_url.SchemeIsFile()); 847 // Load the test file. 848 LoadPageFromURL(file_url); 849 // Do a recursive serialization. We pass if we don't crash. 850 SerializeDomForURL(file_url, true); 851 } 852 853 } // namespace 854