1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/bind.h" 6 #include "base/command_line.h" 7 #include "base/compiler_specific.h" 8 #include "base/containers/hash_tables.h" 9 #include "base/file_util.h" 10 #include "base/files/file_path.h" 11 #include "base/strings/string_util.h" 12 #include "base/strings/utf_string_conversions.h" 13 #include "content/public/common/content_switches.h" 14 #include "content/public/renderer/render_view.h" 15 #include "content/public/renderer/render_view_observer.h" 16 #include "content/public/test/test_utils.h" 17 #include "content/renderer/savable_resources.h" 18 #include "content/shell/browser/shell.h" 19 #include "content/test/content_browser_test.h" 20 #include "content/test/content_browser_test_utils.h" 21 #include "net/base/net_util.h" 22 #include "net/url_request/url_request_context.h" 23 #include "third_party/WebKit/public/platform/WebCString.h" 24 #include "third_party/WebKit/public/platform/WebData.h" 25 #include "third_party/WebKit/public/platform/WebString.h" 26 #include "third_party/WebKit/public/platform/WebURL.h" 27 #include "third_party/WebKit/public/platform/WebVector.h" 28 #include "third_party/WebKit/public/web/WebDocument.h" 29 #include "third_party/WebKit/public/web/WebElement.h" 30 #include "third_party/WebKit/public/web/WebFrame.h" 31 #include "third_party/WebKit/public/web/WebNode.h" 32 #include "third_party/WebKit/public/web/WebNodeCollection.h" 33 #include "third_party/WebKit/public/web/WebNodeList.h" 34 #include "third_party/WebKit/public/web/WebPageSerializer.h" 35 #include "third_party/WebKit/public/web/WebPageSerializerClient.h" 36 #include "third_party/WebKit/public/web/WebView.h" 37 38 using blink::WebCString; 39 using blink::WebData; 40 using blink::WebDocument; 41 using blink::WebElement; 42 using blink::WebFrame; 43 using blink::WebNode; 44 using blink::WebNodeCollection; 45 using blink::WebNodeList; 46 using blink::WebPageSerializer; 47 using blink::WebPageSerializerClient; 48 using blink::WebNode; 49 using blink::WebString; 50 using blink::WebURL; 51 using blink::WebView; 52 using blink::WebVector; 53 54 namespace content { 55 56 // Iterate recursively over sub-frames to find one with with a given url. 57 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) { 58 if (!web_view->mainFrame()) 59 return NULL; 60 61 std::vector<WebFrame*> stack; 62 stack.push_back(web_view->mainFrame()); 63 64 while (!stack.empty()) { 65 WebFrame* current_frame = stack.back(); 66 stack.pop_back(); 67 if (GURL(current_frame->document().url()) == url) 68 return current_frame; 69 WebNodeCollection all = current_frame->document().all(); 70 for (WebNode node = all.firstItem(); 71 !node.isNull(); node = all.nextItem()) { 72 if (!node.isElementNode()) 73 continue; 74 // Check frame tag and iframe tag 75 WebElement element = node.to<WebElement>(); 76 if (!element.hasTagName("frame") && !element.hasTagName("iframe")) 77 continue; 78 WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element); 79 if (sub_frame) 80 stack.push_back(sub_frame); 81 } 82 } 83 return NULL; 84 } 85 86 // Helper function that test whether the first node in the doc is a doc type 87 // node. 88 bool HasDocType(const WebDocument& doc) { 89 WebNode node = doc.firstChild(); 90 if (node.isNull()) 91 return false; 92 return node.nodeType() == WebNode::DocumentTypeNode; 93 } 94 95 // Helper function for checking whether input node is META tag. Return true 96 // means it is META element, otherwise return false. The parameter charset_info 97 // return actual charset info if the META tag has charset declaration. 98 bool IsMetaElement(const WebNode& node, std::string& charset_info) { 99 if (!node.isElementNode()) 100 return false; 101 const WebElement meta = node.toConst<WebElement>(); 102 if (!meta.hasTagName("meta")) 103 return false; 104 charset_info.erase(0, charset_info.length()); 105 // Check the META charset declaration. 106 WebString httpEquiv = meta.getAttribute("http-equiv"); 107 if (LowerCaseEqualsASCII(httpEquiv, "content-type")) { 108 std::string content = meta.getAttribute("content").utf8(); 109 int pos = content.find("charset", 0); 110 if (pos > -1) { 111 // Add a dummy charset declaration to charset_info, which indicates this 112 // META tag has charset declaration although we do not get correct value 113 // yet. 114 charset_info.append("has-charset-declaration"); 115 int remaining_length = content.length() - pos - 7; 116 if (!remaining_length) 117 return true; 118 int start_pos = pos + 7; 119 // Find "=" symbol. 120 while (remaining_length--) 121 if (content[start_pos++] == L'=') 122 break; 123 // Skip beginning space. 124 while (remaining_length) { 125 if (content[start_pos] > 0x0020) 126 break; 127 ++start_pos; 128 --remaining_length; 129 } 130 if (!remaining_length) 131 return true; 132 int end_pos = start_pos; 133 // Now we find out the start point of charset info. Search the end point. 134 while (remaining_length--) { 135 if (content[end_pos] <= 0x0020 || content[end_pos] == L';') 136 break; 137 ++end_pos; 138 } 139 // Get actual charset info. 140 charset_info = content.substr(start_pos, end_pos - start_pos); 141 return true; 142 } 143 } 144 return true; 145 } 146 147 class LoadObserver : public RenderViewObserver { 148 public: 149 LoadObserver(RenderView* render_view, const base::Closure& quit_closure) 150 : RenderViewObserver(render_view), 151 quit_closure_(quit_closure) {} 152 153 virtual void DidFinishLoad(blink::WebFrame* frame) OVERRIDE { 154 if (frame == render_view()->GetWebView()->mainFrame()) 155 quit_closure_.Run(); 156 } 157 158 private: 159 base::Closure quit_closure_; 160 }; 161 162 class DomSerializerTests : public ContentBrowserTest, 163 public WebPageSerializerClient { 164 public: 165 DomSerializerTests() 166 : serialized_(false), 167 local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) {} 168 169 virtual void SetUpCommandLine(CommandLine* command_line) OVERRIDE { 170 command_line->AppendSwitch(switches::kSingleProcess); 171 #if defined(OS_WIN) && defined(USE_AURA) 172 // Don't want to try to create a GPU process. 173 command_line->AppendSwitch(switches::kDisableAcceleratedCompositing); 174 #endif 175 } 176 177 // DomSerializerDelegate. 178 virtual void didSerializeDataForFrame(const WebURL& frame_web_url, 179 const WebCString& data, 180 PageSerializationStatus status) { 181 182 GURL frame_url(frame_web_url); 183 // If the all frames are finished saving, check all finish status 184 if (status == WebPageSerializerClient::AllFramesAreFinished) { 185 SerializationFinishStatusMap::iterator it = 186 serialization_finish_status_.begin(); 187 for (; it != serialization_finish_status_.end(); ++it) 188 ASSERT_TRUE(it->second); 189 serialized_ = true; 190 return; 191 } 192 193 // Check finish status of current frame. 194 SerializationFinishStatusMap::iterator it = 195 serialization_finish_status_.find(frame_url.spec()); 196 // New frame, set initial status as false. 197 if (it == serialization_finish_status_.end()) 198 serialization_finish_status_[frame_url.spec()] = false; 199 200 it = serialization_finish_status_.find(frame_url.spec()); 201 ASSERT_TRUE(it != serialization_finish_status_.end()); 202 // In process frame, finish status should be false. 203 ASSERT_FALSE(it->second); 204 205 // Add data to corresponding frame's content. 206 serialized_frame_map_[frame_url.spec()] += data.data(); 207 208 // Current frame is completed saving, change the finish status. 209 if (status == WebPageSerializerClient::CurrentFrameIsFinished) 210 it->second = true; 211 } 212 213 bool HasSerializedFrame(const GURL& frame_url) { 214 return serialized_frame_map_.find(frame_url.spec()) != 215 serialized_frame_map_.end(); 216 } 217 218 const std::string& GetSerializedContentForFrame( 219 const GURL& frame_url) { 220 return serialized_frame_map_[frame_url.spec()]; 221 } 222 223 RenderView* GetRenderView() { 224 // We could have the test on the UI thread get the WebContent's routing ID, 225 // but we know this will be the first RV so skip that and just hardcode it. 226 return RenderView::FromRoutingID(1); 227 } 228 229 WebView* GetWebView() { 230 return GetRenderView()->GetWebView(); 231 } 232 233 WebFrame* GetMainFrame() { 234 return GetWebView()->mainFrame(); 235 } 236 237 // Load web page according to input content and relative URLs within 238 // the document. 239 void LoadContents(const std::string& contents, 240 const GURL& base_url, 241 const WebString encoding_info) { 242 scoped_refptr<MessageLoopRunner> runner = new MessageLoopRunner; 243 LoadObserver observer(GetRenderView(), runner->QuitClosure()); 244 245 // If input encoding is empty, use UTF-8 as default encoding. 246 if (encoding_info.isEmpty()) { 247 GetMainFrame()->loadHTMLString(contents, base_url); 248 } else { 249 WebData data(contents.data(), contents.length()); 250 251 // Do not use WebFrame.LoadHTMLString because it assumes that input 252 // html contents use UTF-8 encoding. 253 // TODO(darin): This should use WebFrame::loadData. 254 WebFrame* web_frame = GetMainFrame(); 255 256 ASSERT_TRUE(web_frame != NULL); 257 258 web_frame->loadData(data, "text/html", encoding_info, base_url); 259 } 260 261 runner->Run(); 262 } 263 264 // Serialize page DOM according to specific page URL. The parameter 265 // recursive_serialization indicates whether we will serialize all 266 // sub-frames. 267 void SerializeDomForURL(const GURL& page_url, 268 bool recursive_serialization) { 269 // Find corresponding WebFrame according to page_url. 270 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), page_url); 271 ASSERT_TRUE(web_frame != NULL); 272 WebVector<WebURL> links; 273 links.assign(&page_url, 1); 274 WebString file_path = 275 base::FilePath(FILE_PATH_LITERAL("c:\\dummy.htm")).AsUTF16Unsafe(); 276 WebVector<WebString> local_paths; 277 local_paths.assign(&file_path, 1); 278 // Start serializing DOM. 279 bool result = WebPageSerializer::serialize(web_frame, 280 recursive_serialization, 281 static_cast<WebPageSerializerClient*>(this), 282 links, 283 local_paths, 284 local_directory_name_.AsUTF16Unsafe()); 285 ASSERT_TRUE(result); 286 ASSERT_TRUE(serialized_); 287 } 288 289 void SerializeHTMLDOMWithDocTypeOnRenderer(const GURL& file_url) { 290 // Make sure original contents have document type. 291 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 292 ASSERT_TRUE(web_frame != NULL); 293 WebDocument doc = web_frame->document(); 294 ASSERT_TRUE(HasDocType(doc)); 295 // Do serialization. 296 SerializeDomForURL(file_url, false); 297 // Load the serialized contents. 298 ASSERT_TRUE(HasSerializedFrame(file_url)); 299 const std::string& serialized_contents = 300 GetSerializedContentForFrame(file_url); 301 LoadContents(serialized_contents, file_url, 302 web_frame->document().encoding()); 303 // Make sure serialized contents still have document type. 304 web_frame = GetMainFrame(); 305 doc = web_frame->document(); 306 ASSERT_TRUE(HasDocType(doc)); 307 } 308 309 void SerializeHTMLDOMWithoutDocTypeOnRenderer(const GURL& file_url) { 310 // Make sure original contents do not have document type. 311 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 312 ASSERT_TRUE(web_frame != NULL); 313 WebDocument doc = web_frame->document(); 314 ASSERT_TRUE(!HasDocType(doc)); 315 // Do serialization. 316 SerializeDomForURL(file_url, false); 317 // Load the serialized contents. 318 ASSERT_TRUE(HasSerializedFrame(file_url)); 319 const std::string& serialized_contents = 320 GetSerializedContentForFrame(file_url); 321 LoadContents(serialized_contents, file_url, 322 web_frame->document().encoding()); 323 // Make sure serialized contents do not have document type. 324 web_frame = GetMainFrame(); 325 doc = web_frame->document(); 326 ASSERT_TRUE(!HasDocType(doc)); 327 } 328 329 void SerializeXMLDocWithBuiltInEntitiesOnRenderer( 330 const GURL& xml_file_url, const std::string& original_contents) { 331 // Do serialization. 332 SerializeDomForURL(xml_file_url, false); 333 // Compare the serialized contents with original contents. 334 ASSERT_TRUE(HasSerializedFrame(xml_file_url)); 335 const std::string& serialized_contents = 336 GetSerializedContentForFrame(xml_file_url); 337 ASSERT_EQ(original_contents, serialized_contents); 338 } 339 340 void SerializeHTMLDOMWithAddingMOTWOnRenderer( 341 const GURL& file_url, const std::string& original_contents) { 342 // Make sure original contents does not have MOTW; 343 std::string motw_declaration = 344 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8(); 345 ASSERT_FALSE(motw_declaration.empty()); 346 // The encoding of original contents is ISO-8859-1, so we convert the MOTW 347 // declaration to ASCII and search whether original contents has it or not. 348 ASSERT_TRUE(std::string::npos == original_contents.find(motw_declaration)); 349 350 // Do serialization. 351 SerializeDomForURL(file_url, false); 352 // Make sure the serialized contents have MOTW ; 353 ASSERT_TRUE(HasSerializedFrame(file_url)); 354 const std::string& serialized_contents = 355 GetSerializedContentForFrame(file_url); 356 ASSERT_FALSE(std::string::npos == 357 serialized_contents.find(motw_declaration)); 358 } 359 360 void SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer( 361 const GURL& file_url) { 362 // Make sure there is no META charset declaration in original document. 363 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 364 ASSERT_TRUE(web_frame != NULL); 365 WebDocument doc = web_frame->document(); 366 ASSERT_TRUE(doc.isHTMLDocument()); 367 WebElement head_element = doc.head(); 368 ASSERT_TRUE(!head_element.isNull()); 369 // Go through all children of HEAD element. 370 for (WebNode child = head_element.firstChild(); !child.isNull(); 371 child = child.nextSibling()) { 372 std::string charset_info; 373 if (IsMetaElement(child, charset_info)) 374 ASSERT_TRUE(charset_info.empty()); 375 } 376 // Do serialization. 377 SerializeDomForURL(file_url, false); 378 379 // Load the serialized contents. 380 ASSERT_TRUE(HasSerializedFrame(file_url)); 381 const std::string& serialized_contents = 382 GetSerializedContentForFrame(file_url); 383 LoadContents(serialized_contents, file_url, 384 web_frame->document().encoding()); 385 // Make sure the first child of HEAD element is META which has charset 386 // declaration in serialized contents. 387 web_frame = GetMainFrame(); 388 ASSERT_TRUE(web_frame != NULL); 389 doc = web_frame->document(); 390 ASSERT_TRUE(doc.isHTMLDocument()); 391 head_element = doc.head(); 392 ASSERT_TRUE(!head_element.isNull()); 393 WebNode meta_node = head_element.firstChild(); 394 ASSERT_TRUE(!meta_node.isNull()); 395 // Get meta charset info. 396 std::string charset_info2; 397 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2)); 398 ASSERT_TRUE(!charset_info2.empty()); 399 ASSERT_EQ(charset_info2, 400 std::string(web_frame->document().encoding().utf8())); 401 402 // Make sure no more additional META tags which have charset declaration. 403 for (WebNode child = meta_node.nextSibling(); !child.isNull(); 404 child = child.nextSibling()) { 405 std::string charset_info; 406 if (IsMetaElement(child, charset_info)) 407 ASSERT_TRUE(charset_info.empty()); 408 } 409 } 410 411 void SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer( 412 const GURL& file_url) { 413 // Make sure there are multiple META charset declarations in original 414 // document. 415 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 416 ASSERT_TRUE(web_frame != NULL); 417 WebDocument doc = web_frame->document(); 418 ASSERT_TRUE(doc.isHTMLDocument()); 419 WebElement head_ele = doc.head(); 420 ASSERT_TRUE(!head_ele.isNull()); 421 // Go through all children of HEAD element. 422 int charset_declaration_count = 0; 423 for (WebNode child = head_ele.firstChild(); !child.isNull(); 424 child = child.nextSibling()) { 425 std::string charset_info; 426 if (IsMetaElement(child, charset_info) && !charset_info.empty()) 427 charset_declaration_count++; 428 } 429 // The original doc has more than META tags which have charset declaration. 430 ASSERT_TRUE(charset_declaration_count > 1); 431 432 // Do serialization. 433 SerializeDomForURL(file_url, false); 434 435 // Load the serialized contents. 436 ASSERT_TRUE(HasSerializedFrame(file_url)); 437 const std::string& serialized_contents = 438 GetSerializedContentForFrame(file_url); 439 LoadContents(serialized_contents, file_url, 440 web_frame->document().encoding()); 441 // Make sure only first child of HEAD element is META which has charset 442 // declaration in serialized contents. 443 web_frame = GetMainFrame(); 444 ASSERT_TRUE(web_frame != NULL); 445 doc = web_frame->document(); 446 ASSERT_TRUE(doc.isHTMLDocument()); 447 head_ele = doc.head(); 448 ASSERT_TRUE(!head_ele.isNull()); 449 WebNode meta_node = head_ele.firstChild(); 450 ASSERT_TRUE(!meta_node.isNull()); 451 // Get meta charset info. 452 std::string charset_info2; 453 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2)); 454 ASSERT_TRUE(!charset_info2.empty()); 455 ASSERT_EQ(charset_info2, 456 std::string(web_frame->document().encoding().utf8())); 457 458 // Make sure no more additional META tags which have charset declaration. 459 for (WebNode child = meta_node.nextSibling(); !child.isNull(); 460 child = child.nextSibling()) { 461 std::string charset_info; 462 if (IsMetaElement(child, charset_info)) 463 ASSERT_TRUE(charset_info.empty()); 464 } 465 } 466 467 void SerializeHTMLDOMWithEntitiesInTextOnRenderer() { 468 base::FilePath page_file_path = GetTestFilePath( 469 "dom_serializer", "dom_serializer/htmlentities_in_text.htm"); 470 // Get file URL. The URL is dummy URL to identify the following loading 471 // actions. The test content is in constant:original_contents. 472 GURL file_url = net::FilePathToFileURL(page_file_path); 473 ASSERT_TRUE(file_url.SchemeIsFile()); 474 // Test contents. 475 static const char* const original_contents = 476 "<html><body>&<>\"\'</body></html>"; 477 // Load the test contents. 478 LoadContents(original_contents, file_url, WebString()); 479 480 // Get BODY's text content in DOM. 481 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 482 ASSERT_TRUE(web_frame != NULL); 483 WebDocument doc = web_frame->document(); 484 ASSERT_TRUE(doc.isHTMLDocument()); 485 WebElement body_ele = doc.body(); 486 ASSERT_TRUE(!body_ele.isNull()); 487 WebNode text_node = body_ele.firstChild(); 488 ASSERT_TRUE(text_node.isTextNode()); 489 ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) == 490 "&<>\"\'"); 491 // Do serialization. 492 SerializeDomForURL(file_url, false); 493 // Compare the serialized contents with original contents. 494 ASSERT_TRUE(HasSerializedFrame(file_url)); 495 const std::string& serialized_contents = 496 GetSerializedContentForFrame(file_url); 497 // Compare the serialized contents with original contents to make sure 498 // they are same. 499 // Because we add MOTW when serializing DOM, so before comparison, we also 500 // need to add MOTW to original_contents. 501 std::string original_str = 502 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8(); 503 original_str += original_contents; 504 // Since WebCore now inserts a new HEAD element if there is no HEAD element 505 // when creating BODY element. (Please see 506 // HTMLParser::bodyCreateErrorCheck.) We need to append the HEAD content and 507 // corresponding META content if we find WebCore-generated HEAD element. 508 if (!doc.head().isNull()) { 509 WebString encoding = web_frame->document().encoding(); 510 std::string htmlTag("<html>"); 511 std::string::size_type pos = original_str.find(htmlTag); 512 ASSERT_NE(std::string::npos, pos); 513 pos += htmlTag.length(); 514 std::string head_part("<head>"); 515 head_part += 516 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8(); 517 head_part += "</head>"; 518 original_str.insert(pos, head_part); 519 } 520 ASSERT_EQ(original_str, serialized_contents); 521 } 522 523 void SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer() { 524 base::FilePath page_file_path = GetTestFilePath( 525 "dom_serializer", "dom_serializer/htmlentities_in_attribute_value.htm"); 526 // Get file URL. The URL is dummy URL to identify the following loading 527 // actions. The test content is in constant:original_contents. 528 GURL file_url = net::FilePathToFileURL(page_file_path); 529 ASSERT_TRUE(file_url.SchemeIsFile()); 530 // Test contents. 531 static const char* const original_contents = 532 "<html><body title=\"&<>"'\"></body></html>"; 533 // Load the test contents. 534 LoadContents(original_contents, file_url, WebString()); 535 // Get value of BODY's title attribute in DOM. 536 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 537 ASSERT_TRUE(web_frame != NULL); 538 WebDocument doc = web_frame->document(); 539 ASSERT_TRUE(doc.isHTMLDocument()); 540 WebElement body_ele = doc.body(); 541 ASSERT_TRUE(!body_ele.isNull()); 542 WebString value = body_ele.getAttribute("title"); 543 ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'"); 544 // Do serialization. 545 SerializeDomForURL(file_url, false); 546 // Compare the serialized contents with original contents. 547 ASSERT_TRUE(HasSerializedFrame(file_url)); 548 const std::string& serialized_contents = 549 GetSerializedContentForFrame(file_url); 550 // Compare the serialized contents with original contents to make sure 551 // they are same. 552 std::string original_str = 553 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8(); 554 original_str += original_contents; 555 if (!doc.isNull()) { 556 WebString encoding = web_frame->document().encoding(); 557 std::string htmlTag("<html>"); 558 std::string::size_type pos = original_str.find(htmlTag); 559 ASSERT_NE(std::string::npos, pos); 560 pos += htmlTag.length(); 561 std::string head_part("<head>"); 562 head_part += 563 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8(); 564 head_part += "</head>"; 565 original_str.insert(pos, head_part); 566 } 567 ASSERT_EQ(original_str, serialized_contents); 568 } 569 570 void SerializeHTMLDOMWithNonStandardEntitiesOnRenderer(const GURL& file_url) { 571 // Get value of BODY's title attribute in DOM. 572 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 573 WebDocument doc = web_frame->document(); 574 ASSERT_TRUE(doc.isHTMLDocument()); 575 WebElement body_element = doc.body(); 576 // Unescaped string for "%⊅¹'". 577 static const wchar_t parsed_value[] = { 578 '%', 0x2285, 0x00b9, '\'', 0 579 }; 580 WebString value = body_element.getAttribute("title"); 581 ASSERT_TRUE(UTF16ToWide(value) == parsed_value); 582 ASSERT_TRUE(UTF16ToWide(body_element.innerText()) == parsed_value); 583 584 // Do serialization. 585 SerializeDomForURL(file_url, false); 586 // Check the serialized string. 587 ASSERT_TRUE(HasSerializedFrame(file_url)); 588 const std::string& serialized_contents = 589 GetSerializedContentForFrame(file_url); 590 // Confirm that the serialized string has no non-standard HTML entities. 591 ASSERT_EQ(std::string::npos, serialized_contents.find("%")); 592 ASSERT_EQ(std::string::npos, serialized_contents.find("⊅")); 593 ASSERT_EQ(std::string::npos, serialized_contents.find("¹")); 594 ASSERT_EQ(std::string::npos, serialized_contents.find("'")); 595 } 596 597 void SerializeHTMLDOMWithBaseTagOnRenderer(const GURL& file_url, 598 const GURL& path_dir_url) { 599 // There are total 2 available base tags in this test file. 600 const int kTotalBaseTagCountInTestFile = 2; 601 602 // Since for this test, we assume there is no savable sub-resource links for 603 // this test file, also all links are relative URLs in this test file, so we 604 // need to check those relative URLs and make sure document has BASE tag. 605 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 606 ASSERT_TRUE(web_frame != NULL); 607 WebDocument doc = web_frame->document(); 608 ASSERT_TRUE(doc.isHTMLDocument()); 609 // Go through all descent nodes. 610 WebNodeCollection all = doc.all(); 611 int original_base_tag_count = 0; 612 for (WebNode node = all.firstItem(); !node.isNull(); 613 node = all.nextItem()) { 614 if (!node.isElementNode()) 615 continue; 616 WebElement element = node.to<WebElement>(); 617 if (element.hasTagName("base")) { 618 original_base_tag_count++; 619 } else { 620 // Get link. 621 WebString value = GetSubResourceLinkFromElement(element); 622 if (value.isNull() && element.hasTagName("a")) { 623 value = element.getAttribute("href"); 624 if (value.isEmpty()) 625 value = WebString(); 626 } 627 // Each link is relative link. 628 if (!value.isNull()) { 629 GURL link(value.utf8()); 630 ASSERT_TRUE(link.scheme().empty()); 631 } 632 } 633 } 634 ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile); 635 // Make sure in original document, the base URL is not equal with the 636 // |path_dir_url|. 637 GURL original_base_url(doc.baseURL()); 638 ASSERT_NE(original_base_url, path_dir_url); 639 640 // Do serialization. 641 SerializeDomForURL(file_url, false); 642 643 // Load the serialized contents. 644 ASSERT_TRUE(HasSerializedFrame(file_url)); 645 const std::string& serialized_contents = 646 GetSerializedContentForFrame(file_url); 647 LoadContents(serialized_contents, file_url, 648 web_frame->document().encoding()); 649 650 // Make sure all links are absolute URLs and doc there are some number of 651 // BASE tags in serialized HTML data. Each of those BASE tags have same base 652 // URL which is as same as URL of current test file. 653 web_frame = GetMainFrame(); 654 ASSERT_TRUE(web_frame != NULL); 655 doc = web_frame->document(); 656 ASSERT_TRUE(doc.isHTMLDocument()); 657 // Go through all descent nodes. 658 all = doc.all(); 659 int new_base_tag_count = 0; 660 for (WebNode node = all.firstItem(); !node.isNull(); 661 node = all.nextItem()) { 662 if (!node.isElementNode()) 663 continue; 664 WebElement element = node.to<WebElement>(); 665 if (element.hasTagName("base")) { 666 new_base_tag_count++; 667 } else { 668 // Get link. 669 WebString value = GetSubResourceLinkFromElement(element); 670 if (value.isNull() && element.hasTagName("a")) { 671 value = element.getAttribute("href"); 672 if (value.isEmpty()) 673 value = WebString(); 674 } 675 // Each link is absolute link. 676 if (!value.isNull()) { 677 GURL link(std::string(value.utf8())); 678 ASSERT_FALSE(link.scheme().empty()); 679 } 680 } 681 } 682 // We have one more added BASE tag which is generated by JavaScript. 683 ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1); 684 // Make sure in new document, the base URL is equal with the |path_dir_url|. 685 GURL new_base_url(doc.baseURL()); 686 ASSERT_EQ(new_base_url, path_dir_url); 687 } 688 689 void SerializeHTMLDOMWithEmptyHeadOnRenderer() { 690 base::FilePath page_file_path = GetTestFilePath( 691 "dom_serializer", "empty_head.htm"); 692 GURL file_url = net::FilePathToFileURL(page_file_path); 693 ASSERT_TRUE(file_url.SchemeIsFile()); 694 695 // Load the test html content. 696 static const char* const empty_head_contents = 697 "<html><head></head><body>hello world</body></html>"; 698 LoadContents(empty_head_contents, file_url, WebString()); 699 700 // Make sure the head tag is empty. 701 WebFrame* web_frame = GetMainFrame(); 702 ASSERT_TRUE(web_frame != NULL); 703 WebDocument doc = web_frame->document(); 704 ASSERT_TRUE(doc.isHTMLDocument()); 705 WebElement head_element = doc.head(); 706 ASSERT_TRUE(!head_element.isNull()); 707 ASSERT_TRUE(!head_element.hasChildNodes()); 708 ASSERT_TRUE(head_element.childNodes().length() == 0); 709 710 // Do serialization. 711 SerializeDomForURL(file_url, false); 712 // Make sure the serialized contents have META ; 713 ASSERT_TRUE(HasSerializedFrame(file_url)); 714 const std::string& serialized_contents = 715 GetSerializedContentForFrame(file_url); 716 717 // Reload serialized contents and make sure there is only one META tag. 718 LoadContents(serialized_contents, file_url, 719 web_frame->document().encoding()); 720 web_frame = GetMainFrame(); 721 ASSERT_TRUE(web_frame != NULL); 722 doc = web_frame->document(); 723 ASSERT_TRUE(doc.isHTMLDocument()); 724 head_element = doc.head(); 725 ASSERT_TRUE(!head_element.isNull()); 726 ASSERT_TRUE(head_element.hasChildNodes()); 727 ASSERT_TRUE(head_element.childNodes().length() == 1); 728 WebNode meta_node = head_element.firstChild(); 729 ASSERT_TRUE(!meta_node.isNull()); 730 // Get meta charset info. 731 std::string charset_info; 732 ASSERT_TRUE(IsMetaElement(meta_node, charset_info)); 733 ASSERT_TRUE(!charset_info.empty()); 734 ASSERT_EQ(charset_info, 735 std::string(web_frame->document().encoding().utf8())); 736 737 // Check the body's first node is text node and its contents are 738 // "hello world" 739 WebElement body_element = doc.body(); 740 ASSERT_TRUE(!body_element.isNull()); 741 WebNode text_node = body_element.firstChild(); 742 ASSERT_TRUE(text_node.isTextNode()); 743 WebString text_node_contents = text_node.nodeValue(); 744 ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world"); 745 } 746 747 void SerializeDocumentWithDownloadedIFrameOnRenderer(const GURL& file_url) { 748 // Do a recursive serialization. We pass if we don't crash. 749 SerializeDomForURL(file_url, true); 750 } 751 752 void SubResourceForElementsInNonHTMLNamespaceOnRenderer( 753 const GURL& file_url) { 754 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 755 ASSERT_TRUE(web_frame != NULL); 756 WebDocument doc = web_frame->document(); 757 WebNode lastNodeInBody = doc.body().lastChild(); 758 ASSERT_EQ(WebNode::ElementNode, lastNodeInBody.nodeType()); 759 WebString uri = GetSubResourceLinkFromElement( 760 lastNodeInBody.to<WebElement>()); 761 EXPECT_TRUE(uri.isNull()); 762 } 763 764 private: 765 // Map frame_url to corresponding serialized_content. 766 typedef base::hash_map<std::string, std::string> SerializedFrameContentMap; 767 SerializedFrameContentMap serialized_frame_map_; 768 // Map frame_url to corresponding status of serialization finish. 769 typedef base::hash_map<std::string, bool> SerializationFinishStatusMap; 770 SerializationFinishStatusMap serialization_finish_status_; 771 // Flag indicates whether the process of serializing DOM is finished or not. 772 bool serialized_; 773 // The local_directory_name_ is dummy relative path of directory which 774 // contain all saved auxiliary files included all sub frames and resources. 775 const base::FilePath local_directory_name_; 776 }; 777 778 // If original contents have document type, the serialized contents also have 779 // document type. 780 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) { 781 base::FilePath page_file_path = 782 GetTestFilePath("dom_serializer", "youtube_1.htm"); 783 GURL file_url = net::FilePathToFileURL(page_file_path); 784 ASSERT_TRUE(file_url.SchemeIsFile()); 785 // Load the test file. 786 NavigateToURL(shell(), file_url); 787 788 PostTaskToInProcessRendererAndWait( 789 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithDocTypeOnRenderer, 790 base::Unretained(this), file_url)); 791 } 792 793 // If original contents do not have document type, the serialized contents 794 // also do not have document type. 795 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) { 796 base::FilePath page_file_path = 797 GetTestFilePath("dom_serializer", "youtube_2.htm"); 798 GURL file_url = net::FilePathToFileURL(page_file_path); 799 ASSERT_TRUE(file_url.SchemeIsFile()); 800 // Load the test file. 801 NavigateToURL(shell(), file_url); 802 803 PostTaskToInProcessRendererAndWait( 804 base::Bind( 805 &DomSerializerTests::SerializeHTMLDOMWithoutDocTypeOnRenderer, 806 base::Unretained(this), file_url)); 807 } 808 809 // Serialize XML document which has all 5 built-in entities. After 810 // finishing serialization, the serialized contents should be same 811 // with original XML document. 812 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeXMLDocWithBuiltInEntities) { 813 base::FilePath page_file_path = 814 GetTestFilePath("dom_serializer", "note.html"); 815 base::FilePath xml_file_path = GetTestFilePath("dom_serializer", "note.xml"); 816 // Read original contents for later comparison. 817 std::string original_contents; 818 ASSERT_TRUE(base::ReadFileToString(xml_file_path, &original_contents)); 819 // Get file URL. 820 GURL file_url = net::FilePathToFileURL(page_file_path); 821 GURL xml_file_url = net::FilePathToFileURL(xml_file_path); 822 ASSERT_TRUE(file_url.SchemeIsFile()); 823 // Load the test file. 824 NavigateToURL(shell(), file_url); 825 826 PostTaskToInProcessRendererAndWait( 827 base::Bind( 828 &DomSerializerTests::SerializeXMLDocWithBuiltInEntitiesOnRenderer, 829 base::Unretained(this), xml_file_url, original_contents)); 830 } 831 832 // When serializing DOM, we add MOTW declaration before html tag. 833 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) { 834 base::FilePath page_file_path = 835 GetTestFilePath("dom_serializer", "youtube_2.htm"); 836 // Read original contents for later comparison . 837 std::string original_contents; 838 ASSERT_TRUE(base::ReadFileToString(page_file_path, &original_contents)); 839 // Get file URL. 840 GURL file_url = net::FilePathToFileURL(page_file_path); 841 ASSERT_TRUE(file_url.SchemeIsFile()); 842 843 // Load the test file. 844 NavigateToURL(shell(), file_url); 845 846 PostTaskToInProcessRendererAndWait( 847 base::Bind( 848 &DomSerializerTests::SerializeHTMLDOMWithAddingMOTWOnRenderer, 849 base::Unretained(this), file_url, original_contents)); 850 } 851 852 // When serializing DOM, we will add the META which have correct charset 853 // declaration as first child of HEAD element for resolving WebKit bug: 854 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document 855 // does not have META charset declaration. 856 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 857 SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) { 858 base::FilePath page_file_path = 859 GetTestFilePath("dom_serializer", "youtube_1.htm"); 860 // Get file URL. 861 GURL file_url = net::FilePathToFileURL(page_file_path); 862 ASSERT_TRUE(file_url.SchemeIsFile()); 863 // Load the test file. 864 NavigateToURL(shell(), file_url); 865 866 PostTaskToInProcessRendererAndWait( 867 base::Bind( 868 &DomSerializerTests:: 869 SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer, 870 base::Unretained(this), file_url)); 871 } 872 873 // When serializing DOM, if the original document has multiple META charset 874 // declaration, we will add the META which have correct charset declaration 875 // as first child of HEAD element and remove all original META charset 876 // declarations. 877 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 878 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) { 879 base::FilePath page_file_path = 880 GetTestFilePath("dom_serializer", "youtube_2.htm"); 881 // Get file URL. 882 GURL file_url = net::FilePathToFileURL(page_file_path); 883 ASSERT_TRUE(file_url.SchemeIsFile()); 884 // Load the test file. 885 NavigateToURL(shell(), file_url); 886 887 PostTaskToInProcessRendererAndWait( 888 base::Bind( 889 &DomSerializerTests:: 890 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer, 891 base::Unretained(this), file_url)); 892 } 893 894 // Test situation of html entities in text when serializing HTML DOM. 895 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) { 896 // Need to spin up the renderer and also navigate to a file url so that the 897 // renderer code doesn't attempt a fork when it sees a load to file scheme 898 // from non-file scheme. 899 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html")); 900 901 PostTaskToInProcessRendererAndWait( 902 base::Bind( 903 &DomSerializerTests::SerializeHTMLDOMWithEntitiesInTextOnRenderer, 904 base::Unretained(this))); 905 } 906 907 // Test situation of html entities in attribute value when serializing 908 // HTML DOM. 909 // This test started to fail at WebKit r65388. See http://crbug.com/52279. 910 // 911 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge -- 912 // Some attributes are handled differently in the merged serializer. 913 // Bug: http://crbug.com/328354 914 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 915 DISABLE_SerializeHTMLDOMWithEntitiesInAttributeValue) { 916 // Need to spin up the renderer and also navigate to a file url so that the 917 // renderer code doesn't attempt a fork when it sees a load to file scheme 918 // from non-file scheme. 919 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html")); 920 921 PostTaskToInProcessRendererAndWait( 922 base::Bind( 923 &DomSerializerTests:: 924 SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer, 925 base::Unretained(this))); 926 } 927 928 // Test situation of non-standard HTML entities when serializing HTML DOM. 929 // This test started to fail at WebKit r65351. See http://crbug.com/52279. 930 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 931 SerializeHTMLDOMWithNonStandardEntities) { 932 // Make a test file URL and load it. 933 base::FilePath page_file_path = GetTestFilePath( 934 "dom_serializer", "nonstandard_htmlentities.htm"); 935 GURL file_url = net::FilePathToFileURL(page_file_path); 936 NavigateToURL(shell(), file_url); 937 938 PostTaskToInProcessRendererAndWait( 939 base::Bind( 940 &DomSerializerTests:: 941 SerializeHTMLDOMWithNonStandardEntitiesOnRenderer, 942 base::Unretained(this), file_url)); 943 } 944 945 // Test situation of BASE tag in original document when serializing HTML DOM. 946 // When serializing, we should comment the BASE tag, append a new BASE tag. 947 // rewrite all the savable URLs to relative local path, and change other URLs 948 // to absolute URLs. 949 // 950 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge -- 951 // Base tags are handled a bit different in merged version. 952 // Bug: http://crbug.com/328354 953 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 954 DISABLE_SerializeHTMLDOMWithBaseTag) { 955 base::FilePath page_file_path = GetTestFilePath( 956 "dom_serializer", "html_doc_has_base_tag.htm"); 957 958 // Get page dir URL which is base URL of this file. 959 base::FilePath dir_name = page_file_path.DirName(); 960 dir_name = dir_name.Append( 961 base::FilePath::StringType(base::FilePath::kSeparators[0], 1)); 962 GURL path_dir_url = net::FilePathToFileURL(dir_name); 963 964 // Get file URL. 965 GURL file_url = net::FilePathToFileURL(page_file_path); 966 ASSERT_TRUE(file_url.SchemeIsFile()); 967 // Load the test file. 968 NavigateToURL(shell(), file_url); 969 970 PostTaskToInProcessRendererAndWait( 971 base::Bind( 972 &DomSerializerTests::SerializeHTMLDOMWithBaseTagOnRenderer, 973 base::Unretained(this), file_url, path_dir_url)); 974 } 975 976 // Serializing page which has an empty HEAD tag. 977 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) { 978 // Need to spin up the renderer and also navigate to a file url so that the 979 // renderer code doesn't attempt a fork when it sees a load to file scheme 980 // from non-file scheme. 981 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html")); 982 983 PostTaskToInProcessRendererAndWait( 984 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithEmptyHeadOnRenderer, 985 base::Unretained(this))); 986 } 987 988 // Test that we don't crash when the page contains an iframe that 989 // was handled as a download (http://crbug.com/42212). 990 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 991 SerializeDocumentWithDownloadedIFrame) { 992 base::FilePath page_file_path = GetTestFilePath( 993 "dom_serializer", "iframe-src-is-exe.htm"); 994 GURL file_url = net::FilePathToFileURL(page_file_path); 995 ASSERT_TRUE(file_url.SchemeIsFile()); 996 // Load the test file. 997 NavigateToURL(shell(), file_url); 998 999 PostTaskToInProcessRendererAndWait( 1000 base::Bind( 1001 &DomSerializerTests:: 1002 SerializeDocumentWithDownloadedIFrameOnRenderer, 1003 base::Unretained(this), file_url)); 1004 } 1005 1006 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 1007 SubResourceForElementsInNonHTMLNamespace) { 1008 base::FilePath page_file_path = GetTestFilePath( 1009 "dom_serializer", "non_html_namespace.htm"); 1010 GURL file_url = net::FilePathToFileURL(page_file_path); 1011 NavigateToURL(shell(), file_url); 1012 1013 PostTaskToInProcessRendererAndWait( 1014 base::Bind( 1015 &DomSerializerTests:: 1016 SubResourceForElementsInNonHTMLNamespaceOnRenderer, 1017 base::Unretained(this), file_url)); 1018 } 1019 1020 } // namespace content 1021